From c24cd2f64228db53aca493632cfd2fc64e844e22 Mon Sep 17 00:00:00 2001 From: x Date: Mon, 2 Jan 2017 23:30:16 +0100 Subject: [PATCH] . --- .gitmodules | 30 + README.md | 136 +- bitpack.c | 2 +- bitpack.h | 34 +- bitpack128v.c | 116 + bitpack128v_.h | 1998 + bitpack256v.c | 119 + bitpack256v_.h | 1998 + bitpack64_.h | 3 +- bitpackv.c | 2 +- bitpackv32_.h | 2 +- bitunpack.c | 2 +- bitunpack.h | 58 +- bitunpackv.c => bitunpack128v.c | 152 +- bitunpack128v_.h | 2002 + bitunpack256v.c | 500 + bitunpack256v_.h | 2002 + bitunpack64_.h | 5 +- bitutil.c | 101 +- bitutil.h | 123 +- conf.h | 46 +- eliasfano.c | 52 +- eliasfano.h | 26 +- ext/FastPFor | 1 + ext/LittleIntPacker | 1 + ext/MaskedVByte | 1 + ext/MaskedVByte/LICENSE | 202 - ext/MaskedVByte/include/varintdecode.h | 28 - ext/MaskedVByte/include/varintencode.h | 18 - ext/MaskedVByte/src/varintdecode.c | 1760 - ext/MaskedVByte/src/varintencode.c | 94 - ext/OPT_PFD/unpack.h | 2 +- ext/bench_/bench/codecs.h | 172 + ext/bench_/bench/common.h | 59 + ext/bench_/bench/compress.h | 29 + ext/bench_/bench/compress_opt.h | 65 + ext/bench_/bench/compress_qmx.cpp | 1573 + ext/bench_/bench/compress_qmx.h | 43 + ext/bench_/bench/compress_qmx_adcs.cpp | 6700 +++ ext/bench_/bench/compress_qmx_adcs.h | 42 + .../bench/compress_qmx_decompress.cpp} | 12180 +++--- ext/bench_/bench/compress_qmx_v2.cpp | 1468 + ext/bench_/bench/compress_qmx_v2.h | 46 + .../bench/compress_qmx_v2_decompress.cpp | 5448 +++ ext/bench_/bench/compress_qmx_v3.cpp | 1510 + ext/bench_/bench/compress_qmx_v3.h | 47 + .../bench/compress_qmx_v3_decompress.cpp | 33908 ++++++++++++++ ext/bench_/bench/compress_qmx_v4.cpp | 1527 + ext/bench_/bench/compress_qmx_v4.h | 43 + .../bench/compress_qmx_v4_decompress.cpp | 36428 ++++++++++++++++ ext/bench_/bench/compress_turbopackv.h | 33 + ext/bench_/bench/conf.h | 207 + ext/bench_/bench/util.h | 407 + ext/bitshuffle | 1 + ext/c-blosc2 | 1 + ext/ext.c | 120 - ext/for/LICENSE | 201 - ext/for/for-gen.c | 28187 ------------ 
ext/for/for.c | 402 - ext/for/for.h | 241 - ext/libfor | 1 + ext/libvbyte | 1 + ext/lz4 | 1 + ext/lz4.c | 1515 - ext/lz4.h | 361 - ext/polycom/optp4.c | 22 + ext/polycom/optp4.h | 11 + ext/polycom/optpfd.c | 26 + ext/polycom/optpfd.h | 11 + ext/polycom/polyvbyte.c | 14 + ext/polycom/polyvbyte.h | 10 + ext/{ => polycom}/vbyte_poly.h | 0 ext/qmx/GNUmakefile | 10 - ext/qmx/README | 16 - ext/qmx/compress_qmx.h | 22 - ext/qmx/makefile | 10 - ext/rc.c | 1811 + ext/rc.h | 8 + ext/simdcomp | 1 + ext/simdcomp/bitpacka.c | 17774 -------- ext/simdcomp/bitpacka.h | 28 - ext/simdcomp/example.c | 66 - ext/simdcomp/include/simdbitpacking.h | 21 - ext/simdcomp/include/simdcomp.h | 12 - ext/simdcomp/include/simdcomputil.h | 29 - .../include/simdintegratedbitpacking.h | 27 - ext/simdcomp/makefile | 54 - ext/simdcomp/src/simdbitpacking.c | 14009 ------ ext/simdcomp/src/simdcomputil.c | 56 - ext/simdcomp/src/simdintegratedbitpacking.c | 24872 ----------- ext/simdcomp/src/unit.c | 63 - ext/simdcomp_/simdfor.c | 14501 ++++++ ext/simple8b.c | 2 +- ext/simple8b.h | 6 + ext/streamvbyte | 1 + icbench.c | 2598 +- idx.h | 9 +- idxcr.c | 16 +- idxqry.c | 28 +- idxseg.c | 6 +- jic.c | 3 +- makefile | 235 +- plugins.cc | 1153 + plugins.h | 54 + transpose.c | 4 +- transpose.h | 2 +- vint.c | 383 +- vint.h | 260 +- vp4c.c | 236 + vp4c.h | 56 + vp4d.c | 316 + vp4d.h | 133 + vp4dc.c | 2 +- vsimple.c | 20 +- vsimple.h | 18 +- 115 files changed, 124937 insertions(+), 98671 deletions(-) create mode 100644 .gitmodules create mode 100644 bitpack128v.c create mode 100644 bitpack128v_.h create mode 100644 bitpack256v.c create mode 100644 bitpack256v_.h rename bitunpackv.c => bitunpack128v.c (55%) create mode 100644 bitunpack128v_.h create mode 100644 bitunpack256v.c create mode 100644 bitunpack256v_.h create mode 160000 ext/FastPFor create mode 160000 ext/LittleIntPacker create mode 160000 ext/MaskedVByte delete mode 100644 ext/MaskedVByte/LICENSE delete mode 100644 ext/MaskedVByte/include/varintdecode.h 
delete mode 100644 ext/MaskedVByte/include/varintencode.h delete mode 100644 ext/MaskedVByte/src/varintdecode.c delete mode 100644 ext/MaskedVByte/src/varintencode.c create mode 100644 ext/bench_/bench/codecs.h create mode 100644 ext/bench_/bench/common.h create mode 100644 ext/bench_/bench/compress.h create mode 100644 ext/bench_/bench/compress_opt.h create mode 100644 ext/bench_/bench/compress_qmx.cpp create mode 100644 ext/bench_/bench/compress_qmx.h create mode 100644 ext/bench_/bench/compress_qmx_adcs.cpp create mode 100644 ext/bench_/bench/compress_qmx_adcs.h rename ext/{qmx/compress_qmx.cc => bench_/bench/compress_qmx_decompress.cpp} (80%) create mode 100644 ext/bench_/bench/compress_qmx_v2.cpp create mode 100644 ext/bench_/bench/compress_qmx_v2.h create mode 100644 ext/bench_/bench/compress_qmx_v2_decompress.cpp create mode 100644 ext/bench_/bench/compress_qmx_v3.cpp create mode 100644 ext/bench_/bench/compress_qmx_v3.h create mode 100644 ext/bench_/bench/compress_qmx_v3_decompress.cpp create mode 100644 ext/bench_/bench/compress_qmx_v4.cpp create mode 100644 ext/bench_/bench/compress_qmx_v4.h create mode 100644 ext/bench_/bench/compress_qmx_v4_decompress.cpp create mode 100644 ext/bench_/bench/compress_turbopackv.h create mode 100644 ext/bench_/bench/conf.h create mode 100644 ext/bench_/bench/util.h create mode 160000 ext/bitshuffle create mode 160000 ext/c-blosc2 delete mode 100644 ext/ext.c delete mode 100644 ext/for/LICENSE delete mode 100644 ext/for/for-gen.c delete mode 100644 ext/for/for.c delete mode 100644 ext/for/for.h create mode 160000 ext/libfor create mode 160000 ext/libvbyte create mode 160000 ext/lz4 delete mode 100644 ext/lz4.c delete mode 100644 ext/lz4.h create mode 100644 ext/polycom/optp4.c create mode 100644 ext/polycom/optp4.h create mode 100644 ext/polycom/optpfd.c create mode 100644 ext/polycom/optpfd.h create mode 100644 ext/polycom/polyvbyte.c create mode 100644 ext/polycom/polyvbyte.h rename ext/{ => polycom}/vbyte_poly.h (100%) 
delete mode 100644 ext/qmx/GNUmakefile delete mode 100644 ext/qmx/README delete mode 100644 ext/qmx/compress_qmx.h delete mode 100644 ext/qmx/makefile create mode 100644 ext/rc.c create mode 100644 ext/rc.h create mode 160000 ext/simdcomp delete mode 100644 ext/simdcomp/bitpacka.c delete mode 100644 ext/simdcomp/bitpacka.h delete mode 100644 ext/simdcomp/example.c delete mode 100644 ext/simdcomp/include/simdbitpacking.h delete mode 100644 ext/simdcomp/include/simdcomp.h delete mode 100644 ext/simdcomp/include/simdcomputil.h delete mode 100644 ext/simdcomp/include/simdintegratedbitpacking.h delete mode 100644 ext/simdcomp/makefile delete mode 100644 ext/simdcomp/src/simdbitpacking.c delete mode 100644 ext/simdcomp/src/simdcomputil.c delete mode 100644 ext/simdcomp/src/simdintegratedbitpacking.c delete mode 100644 ext/simdcomp/src/unit.c create mode 100644 ext/simdcomp_/simdfor.c create mode 160000 ext/streamvbyte create mode 100644 plugins.cc create mode 100644 plugins.h create mode 100644 vp4c.c create mode 100644 vp4c.h create mode 100644 vp4d.c create mode 100644 vp4d.h diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c23be3a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,30 @@ +[submodule "ext/FastPFor"] + path = ext/FastPFor + url = https://github.com/lemire/FastPFor.git +[submodule "ext/libfor"] + path = ext/libfor + url = https://github.com/cruppstahl/libfor.git +[submodule "ext/lz4"] + path = ext/lz4 + url = https://github.com/Cyan4973/lz4.git +[submodule "ext/bitshuffle"] + path = ext/bitshuffle + url = https://github.com/kiyo-masui/bitshuffle.git +[submodule "ext/c-blosc2"] + path = ext/c-blosc2 + url = https://github.com/Blosc/c-blosc2.git +[submodule "ext/LittleIntPacker"] + path = ext/LittleIntPacker + url = https://github.com/lemire/LittleIntPacker.git +[submodule "ext/streamvbyte"] + path = ext/streamvbyte + url = https://github.com/lemire/streamvbyte.git +[submodule "ext/libvbyte"] + path = ext/libvbyte + url = 
https://github.com/cruppstahl/libvbyte.git +[submodule "ext/MaskedVByte"] + path = ext/MaskedVByte + url = https://github.com/lemire/MaskedVByte.git +[submodule "ext/simdcomp"] + path = ext/simdcomp + url = https://github.com/lemire/simdcomp.git diff --git a/README.md b/README.md index 27128ed..9369868 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po + **TurboPFor: The new synonym for "integer compression"** - 100% C (C++ compatible headers), w/o inline assembly - Usage as simple as memcpy - - :+1: **Java** Critical Natives Interface. Access TurboPFor **incl. SIMD!** from Java as fast as calling from C + - :+1: **Java** Critical Native Interface. Access TurboPFor **incl. SIMD!** from Java as fast as calling from C - :sparkles: **FULL** range 16/32/64 bits integer lists and Floating point - No other "Integer Compression" compress or decompress faster with better compression - Direct Access is several times faster than other libraries @@ -13,11 +13,13 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po + **For/PFor/PForDelta** - **Novel** **"TurboPFor"** (Patched Frame-of-Reference,PFor/PForDelta) scheme with **direct access** or bulk decoding. Outstanding compression and speed. More efficient than **ANY** other fast "integer compression" scheme. - - Compress 70 times faster and decompress up to 3 times faster than OptPFD - - :new: **TurboPFor now 30%! more faster** + - Compress 70 times faster and decompress up to 4 times faster than OptPFD + - :new: **(2017) TurboPFor AVX2, now 50%! more faster!!!!** + - :new: **(2017) TurboPFor Hybrid, better compression and more faster**

+ **Bit Packing** - :sparkles: Fastest and most efficient **"SIMD Bit Packing"** + - :new: **(2017) TurboPack AVX2, now even faster. Decoding 10 billion integers/second** - Scalar **"Bit Packing"** decoding as fast as SIMD-Packing in realistic (No "pure cache") scenarios - Bit Packing with **Direct/Random Access** without decompressing entire blocks - Access any single bit packed entry with **zero decompression** @@ -26,7 +28,7 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po

+ **Variable byte** - :sparkles: Scalar **"Variable Byte"** faster and more efficient than **ANY** other (incl. SIMD MaskedVByte) implementation - - :new: **now up to 25% more faster** + - :new: **(2017) new scheme w. better compression and 30% faster**

+ **Simple family** - :sparkles: **Novel** **"Variable Simple"** (incl. **RLE**) faster and more efficient than simple16, simple-8b @@ -48,66 +50,52 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/po **...forget** ~~Map Reduce, Hadoop, multi-node clusters,~~ ... ### Integer Compression Benchmark: -CPU: Sandy bridge i7-2600k at 4.2GHz, gcc 5.1, ubuntu 15.04, single thread. - Realistic and practical "integer compression" benchmark with **large** integer arrays. - No **PURE** cache benchmark -##### - Synthetic data: - - Generate and test skewed distribution (100.000.000 integers, Block size=128)
+##### - Synthetic data (2017): + - Generate and test (zipfian) skewed distribution (100.000.000 integers, Block size=128/256)
Note: Unlike general purpose compression, a small fixed size (ex. 128 integers) is in general used in "integer compression". Large blocks involved, while processing queries (inverted index, search engines, databases, graphs, in memory computing,...) need to be entirely decoded - ./icbench -a1.5 -m0 -M255 -n100m + ./icbench -a1.5 -m0 -M255 -n100m ZIPF -|Size| Ratio % |Bits/Integer |C Time MI/s |D Time MI/s |Function | -|--------:|-----:|----:|-------:|-------:|---------| -| 63.392.801| 15.85| 5.07|**388.36**|**1600.02**|**TurboPFor**| -| 63.392.801| 15.85| 5.07| 365.26| 246.93|**TurboPForDA**| -| 65.359.916| 16.34| 5.23| 7.09| 638.96|[OptPFD](#OptPFD)| -| 72.364.024| 18.09| 5.79| 85.31| 762.00|[Simple16](#Simple16)| -| 78.514.276| 19.63| 6.28| 251.34| 841.61|**VSimple**| -| 95.915.096| 23.98| 7.67| 221.46|1049.70|[Simple-8b](#Simple-8b)| -| 99.910.930| 24.98| 7.99|**2603.47**|**1948.65**|**TurboPackV**| -| 99.910.930| 24.98| 7.99| 2524.50|1943.41|[SIMDPack FPF](#FastPFor)| -| 99.910.930| 24.98| 7.99| 1883.21|1898.11|**TurboPack**| -| 99.910.930| 24.98| 7.99| 1877.25| 935.83|**TurboForDA**| -|102.074.663| 25.52| 8.17| 1993.95|1827.04|**TurboVbyte**| -|102.074.663| 25.52|8.17|1214.12|1688.95|[MaskedVByte](#MaskedVByte)| -|102.074.663| 25.52| 8.17| 1178.72| 949.59|[Vbyte FPF](#FastPFor)| -|103.035.930| 25.76| 8.24| 1480.47|1746.51|[libfor](#libfor)| -|112.500.000| 28.12| 9.00| 305.85|1899.15|[VarintG8IU](#VarintG8IU)| -|400.000.000|100.00|32.00| 1451.11|1493.46|Copy| -| | | | N/A | N/A |**EliasFano**| +CPU: Skylake i7-6700 w/ only 3.7GHz gcc 6.2 single thread + +|C Size|ratio%|Bits/Integer|C MI/s|D MI/s|Name|File| +|--------:|-----:|--------:|--------:|--------:|----------------|----------------| +|62939886| 15.7| 5.04|**392.67**|**2311.32**|**TurboPFor256**| +|63392759| 15.8| 5.07|329.70|1608.42|**TurboPFor**| +|63392801| 15.8| 5.07|326.18|230.97|**TurboPForDA**| +|65060504| 16.3| 5.20|15.77|687.13|[FP.SIMDOptPFor](#FastPFor)| +|65359916|16.34| 5.23| 7.58| 609.12|OptPFD| +|73477088|18.37| 
5.88|101.68| 621.37|Simple16| +|73481096| 18.4| 5.88|155.16|2187.15|[FP.SimdFastPFor](#FastPFor)| +|76345136| 19.1| 6.11|245.21|652.78|**VSimple**| +|95915096|23.98| 7.67| 211.79|957.62|Simple-8b| +|99910930| 25.0| 7.99|**3289.58**|**2968.35**|**TurboPackV**| +|99910930| 25.0| 7.99|2122.43|2345.68|**TurboPack**| +|99910930| 25.0| 7.99|2105.47|2218.79|**TurboFor**| +|100332929| 25.1| 8.03|**3580.42**|**2998.17**|**TurboPack256V**| +|101015650| 25.3| 8.08|2380.40|2371.07|**TurboVByte**| +|101879302| 25.5| 8.15|65.16|2140.87|[QMX](#QMX)| +|102074663| 25.5| 8.17|1427.73|1979.27|[MaskedVByte](#MaskedVByte)| +|102074663| 25.5| 8.17|564.60|1052.28|[PC.Vbyte](#PolyCom)| +|102083036| 25.5| 8.17|1300.35|1067.45|[FP.VByte](#FastPFor)| +|112500000| 28.1| 9.00|381.85|**3034.90**|[VarintG8IU](#VarintG8IU)| +|125000000| 31.2|10.00|1110.68|2948.33|[StreamVbyte](#StreamVByte)| +|400000000| 100.00| 32.00| 2240.24|2237.05|Copy| +| | | | N/A | N/A |EliasFano| + MI/s: 1.000.000 integers/second. **1000 MI/s = 4 GB/s**
-**#BOLD** = pareto frontier. FPF=FastPFor
+**#BOLD** = pareto frontier.
+FP=FastPFor SC=simdcomp PC=Polycom
TurboPForDA,TurboForDA: Direct Access is normally used when accessing few individual values. -CPU: Skylake i7-6700 w/ only 3.7GHz - -|Size| Ratio % |Bits/Integer |C Time MI/s |D Time MI/s |Function | -|--------:|-----:|----:|-------:|-------:|---------| -| 63392801| 15.85| 5.07|**413.76**|**1749.87**|**TurboPFor**| -| 63392801| 15.85| 5.07| 387.30| 243.62|**TurboPForDA**| -| 65359916| 16.34| 5.23| 7.58| 609.12|OptPFD| -| 73477088| 18.37| 5.88| 101.68| 621.37|Simple16| -| 78514276| 19.63| 6.28|258.31|691.48|**VSimple**| -| 95915096| 23.98| 7.67| 211.79|957.62|Simple-8b| -| 98546814| 24.64| 7.88| 70.85|**2349.71**|[QMX](#QMX)| -| 99910930| 24.98| 7.99|**3537.57**|**3081.79**|**TurboPackV**| -| 99910930| 24.98| 7.99| 3099.52|3071.77|SIMDPack FPF| -| 99910930| 24.98| 7.99| 2095.79|2495.22|**TurboPack**| -| 99910930| 24.98| 7.99| 2049.85|2364.52|**TurboFor**| -| 99910930| 24.98| 7.99| 2049.70|1124.12|**TurboForDA**| -|102074663| 25.52| 8.17| 1825.64|1844.34|**TurboVbyte**| -|102074663| 25.52| 8.17| 1354.42|1745.69|MaskedVByte| -|102074663| 25.52| 8.17| 1249.77|1051.85|Vbyte FPF| -|112500000| 28.12| 9.00| 466.94|3003.70|VarintG8IU| -|128125000| 32.03| 10.25| 1109.67|1271.32|[StreamVbyte FPF](#FastPFor)| -|400000000| 100.00| 32.00| 2240.24|2237.05|Copy| ------------------------------------------------------------------------ -##### - Data files: - - CPU: Sandy bridge i7-2600k at 4.2GHz +##### - Data files (2016): + - CPU: Sandy bridge i7-2600k at 4.2GHz - gov2.sorted from [DocId data set](#DocId data set) Block size=128 (lz4+blosc+VSimple w/ 64Ki) @@ -115,28 +103,35 @@ CPU: Skylake i7-6700 w/ only 3.7GHz |Size |Ratio %|Bits/Integer|C Time MI/s|D Time MI/s|Function | |----------:|-----:|----:|------:|------:|---------------------| -| 3.214.763.689| 13.44| 4.30| 339.90|837.69|**VSimple 64Ki**| +| 3.319.692.190| 13.88| 4.44|**336.68**|**1410.74**|**TurboPFor**| | 3.337.758.854| 13.95| 4.47| 5.06| 513.00|OptPFD| | 3.357.673.495| 14.04| 
4.49|**357.77**|**1192.14**|**TurboPFor**| | 3.501.671.314| 14.64| 4.68| 321.45| 827.01|**VSimple**| | 3.766.174.764| 15.75| 5.04|**617.88**| 712.31|**EliasFano**| | 3.820.190.182| 15.97| 5.11| 118.81| 650.21|Simple16| -| 3.958.888.197| 16.55| 5.30| 279.19| 618.60|[lz4](#lz4)+DT 64Ki| | 4.521.326.518| 18.90| 6.05| 209.17| 824.26|Simple-8b| +| 4.647.699.724| 19.43| 6.22|**889.02**|1130.50|**TurboVbyte**| | 4.683.323.301| 19.58| 6.27|**828.97**|1007.44|**TurboVbyte**| | 4.953.768.342| 20.71| 6.63|**1766.05**|**1943.87**|**TurboPackV**| | 4.953.768.342| 20.71| 6.63|1419.35|1512.86|**TurboPack**| | 5.203.353.057| 21.75| 6.96|1560.34|1806.60|SIMDPackD1 FPF| -| 6.074.995.117| 25.40| 8.13| 494.70| 729.97|[blosc_lz4](#blosc) 64Ki| | 6.221.886.390| 26.01| 8.32|1666.76|1737.72|**TurboFor**| | 6.221.886.390| 26.01| 8.32|1660.52| 565.25|**TurboForDA**| | 6.699.519.000| 28.01| 8.96| 472.01| 495.12|Vbyte FPF| | 6.700.989.563| 28.02| 8.96| 728.72| 991.57|MaskedVByte| | 7.622.896.878| 31.87|10.20| 208.73|1197.74|VarintG8IU| | 8.594.342.216| 35.93|11.50|1307.22|1593.07|libfor| -| 8.773.150.644| 36.68|11.74| 637.83|1301.05|blosc_lz 64Ki| |23.918.861.764|100.00|32.00|1456.17|1480.78|Copy| + +|Size |Ratio %|Bits/Integer|C Time MI/s|D Time MI/s|Function | +|----------:|-----:|----:|------:|------:|---------------------| +| 3.214.763.689| 13.44| 4.30| 339.90| 837.69|**VSimple 64Ki**| +| 3.958.888.197| 16.55| 5.30| 279.19| 618.60|[lz4](#lz4)+DT 64Ki| +| 6.074.995.117| 25.40| 8.13| 494.70| 729.97|[blosc_lz4](#blosc) 64Ki| +| 8.773.150.644| 36.68|11.74| 637.83|1301.05|blosc_lz 64Ki| + + Ki=1024 Integers. 64Ki = 256k bytes
"lz4+DT 64Ki" = Delta+Transpose from TurboPFor + lz4
"blosc_lz4" tested w/ lz4 compressor+vectorized shuffle @@ -180,30 +175,33 @@ using [900.000 multicore servers](https://www.cloudyn.com/blog/10-facts-didnt-kn ### Compile: *make* + or + *make AVX2=1* ### Testing: ##### - Synthetic data: - + test all "integer compression" functions
+ + benchmark "integer compression" functions
- ./icbench -a1.0 -m0 -M255 -n100m + ./icbench -eBENCH -a1.2 -m0 -M255 -n100m ZIPF + ./icbench -eBENCH/BITPACK/VBYTE -a1.2 -m0 -M255 -n100m ZIPF - >*-zipfian distribution alpha = 1.0 (Ex. -a1.0=uniform -a1.5=skewed distribution)
+ >*-zipfian distribution alpha = 1.2 (Ex. -a1.0=uniform -a1.5=skewed distribution)
-number of integers = 100.000.000
-integer range from 0 to 255
* + individual function test (ex. Copy TurboPack TurboPFor)
- ./icbench -a1.5 -m0 -M255 -ecopy/turbopack/turbopfor -n100m + ./icbench -a1.5 -m0 -M255 -ecopy/turbopack/turbopfor/turbopack256v ZIPF ##### - Data files: - - Data file Benchmark (file from [DocId data set](#DocId data set)) + - Sorted data file Benchmark (file from [DocId data set](#DocId data set)) - ./icbench -c1 gov2.sorted + ./icbench -fS gov2.sorted ##### - Intersections: @@ -294,11 +292,12 @@ header files to use with documentation:
###### Multithreading: - All TurboPFor integer compression functions are thread safe -### Libraries benchmarked: +### References: + [FastPFor](https://github.com/lemire/FastPFor) + [Simdcomp](https://github.com/lemire/simdcomp): SIMDPack FPF, Vbyte FPF, VarintG8IU, StreamVbyte + [Optimized Pfor-delta compression code](http://jinruhe.com): OptPFD/OptP4, Simple16 (limited to 28 bits integers) + [MaskedVByte](http://maskedvbyte.org/). See also: [Vectorized VByte Decoding](http://engineering.indeed.com/blog/2015/03/vectorized-vbyte-decoding-high-performance-vector-instructions/) + + [Streamvbyte](https://github.com/lemire/streamvbyte). + [Index Compression Using 64-Bit Words](http://people.eng.unimelb.edu.au/ammoffat/abstracts/am10spe.html): Simple-8b (speed optimized version tested) + [libfor](https://github.com/cruppstahl/for) + [Compression, SIMD, and Postings Lists](http://www.cs.otago.ac.nz/homepages/andrew/papers/) QMX integer compression from the "simple family" @@ -306,16 +305,15 @@ header files to use with documentation:
+ [blosc](https://github.com/Blosc/c-blosc). blosc is like transpose/shuffle+lz77. Tested blosc+lz4 and blosclz incl. vectorizeed shuffle.
+ [Document identifier data set](http://lemire.me/data/integercompression2014.html) -### References: - + **TurboPFor** - - [Optimizing communication by compression for Multi-GPU Scalable Breadth-First Searches](http://oa.upm.es/40842/) + [gpugraph500](https://github.com/UniHD-CEG/gpugraph500) - - [Small Polygon Compression](http://abhinavjauhri.com/publications/dcc_poster_2016.pdf) + [big_num Compression](https://github.com/ajauhri/bignum_compression) - - [TurboPForErl](https://github.com/johannesh/TurboPForErl) + **Integer compression publications:** + - [In Vacuo and In Situ Evaluation of SIMD Codecs (TurboPackV,TurboPFor/QMX)](http://dl.acm.org/citation.cfm?id=3015023) + [paper](http://www.cs.otago.ac.nz/homepages/andrew/papers/) - [SIMD Compression and the Intersection of Sorted Integers](http://arxiv.org/abs/1401.6399) - [Partitioned Elias-Fano Indexes](http://www.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf) - [On Inverted Index Compression for Search Engine Efficiency](http://www.dcs.gla.ac.uk/~craigm/publications/catena14compression.pdf) - [Google's Group Varint Encoding](http://static.googleusercontent.com/media/research.google.com/de//people/jeff/WSDM09-keynote.pdf) -Last update: 11 SEP 2016 + + **Applications:** + - [Graph500](https://github.com/julianromera/graph500) + - [Small Polygon Compression](http://abhinavjauhri.com/publications/dcc_poster_2016.pdf) + [code](https://github.com/ajauhri/bignum_compression) +Last update: 02 JAN 2017 diff --git a/bitpack.c b/bitpack.c index 682fb32..8d7b61e 100644 --- a/bitpack.c +++ b/bitpack.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify diff --git a/bitpack.h b/bitpack.h index c84acff..4e0e281 100644 --- a/bitpack.h +++ b/bitpack.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can 
redistribute it and/or modify @@ -28,30 +28,32 @@ extern "C" { #endif #include // bipackNN: Pack array with n unsigned (NN bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out -unsigned char *bitpack32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); -unsigned char *bitpack16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); -unsigned char *bitpack64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); +unsigned char *bitpack32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); +unsigned char *bitpack16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); +unsigned char *bitpack64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); // delta bit packing -unsigned char *bitdpack32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitd1pack32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitdpack32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitd1pack32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); // for bit packing -unsigned char *bitfpack32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitf1pack32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitfpack32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitf1pack32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); // zigzag -unsigned char *bitzpack32( unsigned *__restrict in, 
unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitzpack32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); //-------------------------------------- SIMD ------------------------------------------------------------------------------------------ -// Pack array with n unsigned (32 bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out -unsigned char *bitpackv32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); -unsigned char *bitdpackv32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitd1packv32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); -unsigned char *bitzpackv32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b); +// Pack array with 128 unsigned (32 bits in[n]) values to the buffer out using nbits per value. 
Return value = end of compressed buffer out +unsigned char *bitpack128v32( unsigned *__restrict in, unsigned char *__restrict out , unsigned b); +unsigned char *bitdpack128v32( unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitzpack128v32( unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b); -// like bitpack32 but for 16 bits arrays -unsigned char *bitpackv16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out , unsigned b); +unsigned char *bitpack256v32( unsigned *__restrict in, unsigned char *__restrict out , unsigned b); +unsigned char *bitdpack256v32( unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b); +unsigned char *bitzpack256v32( unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b); #ifdef __cplusplus } diff --git a/bitpack128v.c b/bitpack128v.c new file mode 100644 index 0000000..1bdf454 --- /dev/null +++ b/bitpack128v.c @@ -0,0 +1,116 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// "Integer Compression" SIMD bit packing + #ifndef VSTI +#include +#include "bitpack.h" +#include "bitutil.h" + +#define PAD8(__x) (((__x)+8-1)/8) + +#define VSTI(ip, i, iv, parm) +#define IPP(ip, i, iv) _mm_loadu_si128(ip++) +#include __FILE__ + +unsigned char *bitpack128v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; } +//unsigned char *bitpack128v16(unsigned short *__restrict in, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(n*b); BITPACK128V32(in, n, b, out, 0); return pout; } +#undef VSTI +#undef IPP + +//------------------------------------------------------------------------------------------------------------------------------ +#define VSTI(__ip, __i, __iv, __sv) v = _mm_loadu_si128(__ip++); __iv = DELTA128x32(v,__sv); __sv = v +#define IPP(ip, i, __iv) __iv +#include __FILE__ + +unsigned char *bitdpack128v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b); + __m128i v,sv = _mm_set1_epi32(start); + BITPACK128V32(in, b, out, sv); + return pout; +} +#undef VSTI + +//------------------------------------------------------------------------------------------------------------------------------ +#define VSTI(__ip, __i, __iv, __sv) v = _mm_loadu_si128(__ip++); __iv = _mm_sub_epi32(DELTA128x32(v,__sv),cv); __sv = v + +unsigned char *bitd1pack128v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = 
out+PAD8(128*b); + __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); + BITPACK128V32(in, b, out, sv); return pout; +} +#undef VSTI +//------------------------------------------------------------------------------------------------------------------------------ +#define VSTI(__ip, __i, __iv, __sv) v = _mm_loadu_si128(__ip++); __iv = DELTA128x32(v,__sv); __sv = v; __iv = ZIGZAG128x32(__iv) + +unsigned char *bitzpack128v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b); + __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); + BITPACK128V32(in, b, out, sv); + return pout; +} +#undef VSTI + #else +#include +#include + +#define OPPE(__op) +#define IPPE(__op) + +#include "bitpack128v_.h" + +#define BITPACK128V32(__pip, __nbits, __pop, __parm) { __m128i *__ip=(__m128i *)__pip,*__op=(__m128i *)__pop;\ + switch(__nbits) {\ + case 0: break;\ + case 1:{ BITPACK128V32_1( __ip, __op, __parm); } break;\ + case 2:{ BITPACK128V32_2( __ip, __op, __parm); } break;\ + case 3:{ BITPACK128V32_3( __ip, __op, __parm); } break;\ + case 4:{ BITPACK128V32_4( __ip, __op, __parm); } break;\ + case 5:{ BITPACK128V32_5( __ip, __op, __parm); } break;\ + case 6:{ BITPACK128V32_6( __ip, __op, __parm); } break;\ + case 7:{ BITPACK128V32_7( __ip, __op, __parm); } break;\ + case 8:{ BITPACK128V32_8( __ip, __op, __parm); } break;\ + case 9:{ BITPACK128V32_9( __ip, __op, __parm); } break;\ + case 10:{ BITPACK128V32_10(__ip, __op, __parm); } break;\ + case 11:{ BITPACK128V32_11(__ip, __op, __parm); } break;\ + case 12:{ BITPACK128V32_12(__ip, __op, __parm); } break;\ + case 13:{ BITPACK128V32_13(__ip, __op, __parm); } break;\ + case 14:{ BITPACK128V32_14(__ip, __op, __parm); } break;\ + case 15:{ BITPACK128V32_15(__ip, __op, __parm); } break;\ + case 16:{ BITPACK128V32_16(__ip, __op, __parm); } break;\ + case 17:{ BITPACK128V32_17(__ip, __op, __parm); } break;\ + case 18:{ BITPACK128V32_18(__ip, __op, 
__parm); } break;\ + case 19:{ BITPACK128V32_19(__ip, __op, __parm); } break;\ + case 20:{ BITPACK128V32_20(__ip, __op, __parm); } break;\ + case 21:{ BITPACK128V32_21(__ip, __op, __parm); } break;\ + case 22:{ BITPACK128V32_22(__ip, __op, __parm); } break;\ + case 23:{ BITPACK128V32_23(__ip, __op, __parm); } break;\ + case 24:{ BITPACK128V32_24(__ip, __op, __parm); } break;\ + case 25:{ BITPACK128V32_25(__ip, __op, __parm); } break;\ + case 26:{ BITPACK128V32_26(__ip, __op, __parm); } break;\ + case 27:{ BITPACK128V32_27(__ip, __op, __parm); } break;\ + case 28:{ BITPACK128V32_28(__ip, __op, __parm); } break;\ + case 29:{ BITPACK128V32_29(__ip, __op, __parm); } break;\ + case 30:{ BITPACK128V32_30(__ip, __op, __parm); } break;\ + case 31:{ BITPACK128V32_31(__ip, __op, __parm); } break;\ + case 32:{ BITPACK128V32_32(__ip, __op, __parm); } break;\ + }\ +} + #endif diff --git a/bitpack128v_.h b/bitpack128v_.h new file mode 100644 index 0000000..0fbea52 --- /dev/null +++ b/bitpack128v_.h @@ -0,0 +1,1998 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// TurboPFor: Integer Compression SIMD bit packing +#define BITBLK128V32_1(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 1));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 3));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 5));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 7));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 9));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 10));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 11));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 12));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 13));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 14));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 15));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, 
i*32+17, iv), 17));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 18));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 19));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 20));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 21));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 22));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 23));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 24));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 25));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 26));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 27));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 28));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 29));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 30));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 31)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_1(ip, op, parm) {\ + BITBLK128V32_1(ip, 0, op, parm); IPPE(ip); OPPE(op += 1*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_2(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 1, iv), 2));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 6));\ + VSTI(ip, i*16+ 4, iv, 
parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 10));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 14));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 18));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 20));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+11, iv), 22));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 24));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 26));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 28));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 30)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_2(ip, op, parm) {\ + BITBLK128V32_2(ip, 0, op, parm);\ + BITBLK128V32_2(ip, 1, op, parm); IPPE(ip); OPPE(op += 2*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_3(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 3));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 6));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 9));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 15));\ + VSTI(ip, i*32+ 6, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 18));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 21));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 27));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 1));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 7));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 10));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 13));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 19));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 22));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 25));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 28));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 2));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 5));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32( IPP(ip, i*32+25, iv), 11));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 14));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 17));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 20));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 23));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 26));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 29)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_3(ip, op, parm) {\ + BITBLK128V32_3(ip, 0, op, parm); IPPE(ip); OPPE(op += 3*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_4(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 1, iv), 4));\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 2, iv), 8));\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 3, iv), 12));\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 5, iv), 20));\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 6, iv), 24));\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 28)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_4(ip, op, parm) {\ + BITBLK128V32_4(ip, 0, op, parm);\ + BITBLK128V32_4(ip, 1, op, parm);\ + BITBLK128V32_4(ip, 2, op, parm);\ + BITBLK128V32_4(ip, 3, op, parm); IPPE(ip); OPPE(op += 4*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_5(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 
1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 5));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 15));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 20));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 25));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 3));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 13));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 18));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 23));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 1));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 6));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 11));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 21));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 26));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 31)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 4));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 9));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 14));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 19));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 24));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 2));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 7));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 12));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 17));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 22));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 27)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_5(ip, op, parm) {\ + BITBLK128V32_5(ip, 0, op, parm); IPPE(ip); OPPE(op += 5*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_6(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 1, iv), 6));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 12));\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 18));\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, iv), 24));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 10));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 22));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+11, iv), 2));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 8));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 14));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 20));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 26)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_6(ip, op, parm) {\ + BITBLK128V32_6(ip, 0, op, parm);\ + BITBLK128V32_6(ip, 1, op, parm); IPPE(ip); OPPE(op += 6*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_7(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 7));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 14));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 21));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 
5, iv), 3));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 10));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 17));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 6));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 13));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 20));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 2));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 9));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 23));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 5));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 12));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 19));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, 
i*32+23, iv), 1));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 15));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 22));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 4));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 11));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 18));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 25)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_7(ip, op, parm) {\ + BITBLK128V32_7(ip, 0, op, parm); IPPE(ip); OPPE(op += 7*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_8(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 1, iv), 8));\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 2, iv), 16));\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 3, iv), 24)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_8(ip, op, parm) {\ + BITBLK128V32_8(ip, 0, op, parm);\ + BITBLK128V32_8(ip, 1, op, parm);\ + BITBLK128V32_8(ip, 2, op, parm);\ + BITBLK128V32_8(ip, 3, op, parm);\ + BITBLK128V32_8(ip, 4, op, parm);\ + BITBLK128V32_8(ip, 5, op, parm);\ + BITBLK128V32_8(ip, 6, op, parm);\ + BITBLK128V32_8(ip, 7, op, parm); IPPE(ip); OPPE(op += 8*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_9(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, 
parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 9));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 18));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 13));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 22));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 17));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 3));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 12));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 21));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 7));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32( IPP(ip, i*32+18, iv), 2));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 11));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 20));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 6));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 15));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 1));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 10));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 19));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 5));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 14));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 23)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_9(ip, op, parm) {\ + BITBLK128V32_9(ip, 0, op, parm); IPPE(ip); OPPE(op += 9*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_10(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 1, iv), 10));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 20));\ + VSTI(ip, i*16+ 3, 
iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 18));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 6));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 4));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+11, iv), 14));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 2));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 12));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 22)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_10(ip, op, parm) {\ + BITBLK128V32_10(ip, 0, op, parm);\ + BITBLK128V32_10(ip, 1, op, parm); IPPE(ip); OPPE(op += 10*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_11(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 11));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 1));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 2));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 13));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 3));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 14));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 15));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 5));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 6));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32( IPP(ip, i*32+19, iv), 17));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 7));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 18));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 19));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 9));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 20));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 10));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 21)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_11(ip, op, parm) {\ + BITBLK128V32_11(ip, 0, op, parm); IPPE(ip); OPPE(op += 11*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_12(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 1, iv), 12));\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + 
VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 3, iv), 4));\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 6, iv), 8));\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 20)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_12(ip, op, parm) {\ + BITBLK128V32_12(ip, 0, op, parm);\ + BITBLK128V32_12(ip, 1, op, parm);\ + BITBLK128V32_12(ip, 2, op, parm);\ + BITBLK128V32_12(ip, 3, op, parm); IPPE(ip); OPPE(op += 12*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_13(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 13));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 7));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 1));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 14));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 2));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 15));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 9));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 3));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 10));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 4));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 17));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 11));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 5));\ + VSTI(ip, 
i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 18));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 12));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 6));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 19)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_13(ip, op, parm) {\ + BITBLK128V32_13(ip, 0, op, parm); IPPE(ip); OPPE(op += 13*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_14(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 1, iv), 14));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 10));\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 6));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 2));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 12));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 8));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 4));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 18)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_14(ip, op, parm) {\ + BITBLK128V32_14(ip, 0, op, parm);\ + BITBLK128V32_14(ip, 1, op, parm); IPPE(ip); OPPE(op += 14*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_15(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 15));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 13));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 11));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( 
IPP(ip, i*32+ 7, iv), 9));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 7));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 5));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 3));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 1));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 14));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 12));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 10));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*32+23, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 6));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 4));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 2));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 17)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_15(ip, op, parm) {\ + BITBLK128V32_15(ip, 0, op, parm); IPPE(ip); OPPE(op += 15*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_16(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*2+ 0, iv, parm); ov = IPP(ip, i*2+ 0, iv);\ + VSTI(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*2+ 1, iv), 16)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_16(ip, op, parm) {\ + BITBLK128V32_16(ip, 0, op, parm);\ + BITBLK128V32_16(ip, 1, op, parm);\ + BITBLK128V32_16(ip, 2, op, parm);\ + BITBLK128V32_16(ip, 3, op, parm);\ + BITBLK128V32_16(ip, 4, op, parm);\ + BITBLK128V32_16(ip, 5, op, parm);\ + BITBLK128V32_16(ip, 6, op, parm);\ + BITBLK128V32_16(ip, 7, op, parm);\ + BITBLK128V32_16(ip, 8, op, parm);\ + BITBLK128V32_16(ip, 9, op, parm);\ + BITBLK128V32_16(ip, 10, op, parm);\ + BITBLK128V32_16(ip, 11, op, parm);\ + BITBLK128V32_16(ip, 12, op, parm);\ + 
BITBLK128V32_16(ip, 13, op, parm);\ + BITBLK128V32_16(ip, 14, op, parm);\ + BITBLK128V32_16(ip, 15, op, parm); IPPE(ip); OPPE(op += 16*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_17(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 10));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 12));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+14, iv, 
parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 14));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 1));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 3));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 5));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 7));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 9));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 11));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( 
IPP(ip, i*32+29, iv), 13));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 15)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_17(ip, op, parm) {\ + BITBLK128V32_17(ip, 0, op, parm); IPPE(ip); OPPE(op += 17*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_18(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 2));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, 
i*16+11, iv), 6));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 10));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 14)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_18(ip, op, parm) {\ + BITBLK128V32_18(ip, 0, op, parm);\ + BITBLK128V32_18(ip, 1, op, parm); IPPE(ip); OPPE(op += 18*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_19(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 6));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 5));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32( IPP(ip, i*32+ 9, iv), 11));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 10));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 3));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 9));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 2));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+24, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 1));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 7));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 13)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_19(ip, op, parm) {\ + BITBLK128V32_19(ip, 0, op, parm); IPPE(ip); OPPE(op += 19*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_20(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 2, iv), 8));\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 5, iv), 4));\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 12)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_20(ip, op, parm) {\ + BITBLK128V32_20(ip, 0, op, parm);\ + BITBLK128V32_20(ip, 1, op, parm);\ + BITBLK128V32_20(ip, 2, op, parm);\ + BITBLK128V32_20(ip, 3, op, parm); IPPE(ip); OPPE(op += 20*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_21(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 9));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 
14);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 7));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 6));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 5));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 4));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 3));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*32+25, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 2));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 1));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 11)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_21(ip, op, parm) {\ + BITBLK128V32_21(ip, 0, op, parm); IPPE(ip); OPPE(op += 21*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_22(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 2));\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ + VSTI(ip, i*16+ 7, iv, parm); ov 
= _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 6));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 8));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 10)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_22(ip, op, parm) {\ + BITBLK128V32_22(ip, 0, op, parm);\ + BITBLK128V32_22(ip, 1, op, parm); IPPE(ip); OPPE(op += 22*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_23(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 5));\ + VSTI(ip, i*32+ 4, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 1));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 6));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 2));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 7));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 3));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 4));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 9)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_23(ip, op, 
parm) {\ + BITBLK128V32_23(ip, 0, op, parm); IPPE(ip); OPPE(op += 23*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_24(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 3, iv), 8)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_24(ip, op, parm) {\ + BITBLK128V32_24(ip, 0, op, parm);\ + BITBLK128V32_24(ip, 1, op, parm);\ + BITBLK128V32_24(ip, 2, op, parm);\ + BITBLK128V32_24(ip, 3, op, parm);\ + BITBLK128V32_24(ip, 4, op, parm);\ + BITBLK128V32_24(ip, 5, op, parm);\ + BITBLK128V32_24(ip, 6, op, parm);\ + BITBLK128V32_24(ip, 7, op, parm); IPPE(ip); OPPE(op += 24*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_25(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 1));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 5));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 2));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 6));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 3));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 7)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_25(ip, op, parm) {\ + BITBLK128V32_25(ip, 0, op, parm); IPPE(ip); OPPE(op += 25*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_26(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 2));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 4));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 
12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 6)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_26(ip, op, parm) {\ + BITBLK128V32_26(ip, 0, op, parm);\ + BITBLK128V32_26(ip, 1, op, parm); IPPE(ip); OPPE(op += 26*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_27(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 2));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 1));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+24, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 3));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 5)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_27(ip, op, parm) {\ + BITBLK128V32_27(ip, 0, op, parm); IPPE(ip); OPPE(op += 27*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_28(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 4)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_28(ip, op, parm) {\ + BITBLK128V32_28(ip, 0, op, parm);\ + BITBLK128V32_28(ip, 1, op, parm);\ + BITBLK128V32_28(ip, 2, op, parm);\ + BITBLK128V32_28(ip, 3, op, parm); IPPE(ip); OPPE(op += 28*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_29(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 2));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32( IPP(ip, i*32+21, iv), 1));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 3)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_29(ip, op, parm) {\ + BITBLK128V32_29(ip, 0, op, parm); IPPE(ip); OPPE(op += 29*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_30(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 2)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_30(ip, op, parm) {\ + BITBLK128V32_30(ip, 0, op, parm);\ + BITBLK128V32_30(ip, 1, op, parm); IPPE(ip); OPPE(op += 30*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_31(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 1)); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_31(ip, op, parm) {\ + BITBLK128V32_31(ip, 0, op, parm); IPPE(ip); OPPE(op += 31*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_32(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*1+ 0, iv, parm); ov = IPP(ip, i*1+ 0, iv); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_32(ip, op, parm) {\ + BITBLK128V32_32(ip, 0, op, parm);\ + BITBLK128V32_32(ip, 1, op, parm);\ + BITBLK128V32_32(ip, 2, op, parm);\ + BITBLK128V32_32(ip, 3, op, parm);\ + BITBLK128V32_32(ip, 4, op, parm);\ + BITBLK128V32_32(ip, 5, op, parm);\ + 
BITBLK128V32_32(ip, 6, op, parm);\ + BITBLK128V32_32(ip, 7, op, parm);\ + BITBLK128V32_32(ip, 8, op, parm);\ + BITBLK128V32_32(ip, 9, op, parm);\ + BITBLK128V32_32(ip, 10, op, parm);\ + BITBLK128V32_32(ip, 11, op, parm);\ + BITBLK128V32_32(ip, 12, op, parm);\ + BITBLK128V32_32(ip, 13, op, parm);\ + BITBLK128V32_32(ip, 14, op, parm);\ + BITBLK128V32_32(ip, 15, op, parm);\ + BITBLK128V32_32(ip, 16, op, parm);\ + BITBLK128V32_32(ip, 17, op, parm);\ + BITBLK128V32_32(ip, 18, op, parm);\ + BITBLK128V32_32(ip, 19, op, parm);\ + BITBLK128V32_32(ip, 20, op, parm);\ + BITBLK128V32_32(ip, 21, op, parm);\ + BITBLK128V32_32(ip, 22, op, parm);\ + BITBLK128V32_32(ip, 23, op, parm);\ + BITBLK128V32_32(ip, 24, op, parm);\ + BITBLK128V32_32(ip, 25, op, parm);\ + BITBLK128V32_32(ip, 26, op, parm);\ + BITBLK128V32_32(ip, 27, op, parm);\ + BITBLK128V32_32(ip, 28, op, parm);\ + BITBLK128V32_32(ip, 29, op, parm);\ + BITBLK128V32_32(ip, 30, op, parm);\ + BITBLK128V32_32(ip, 31, op, parm); IPPE(ip); OPPE(op += 32*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_33(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 5)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+18, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_33(ip, op, parm) {\ + BITBLK128V32_33(ip, 0, op, parm); IPPE(ip); OPPE(op += 33*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_34(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + 
VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_34(ip, op, parm) {\ + BITBLK128V32_34(ip, 0, op, parm);\ + BITBLK128V32_34(ip, 1, op, parm); IPPE(ip); OPPE(op += 34*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_35(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*32+17, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, 
i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_35(ip, op, parm) {\ + BITBLK128V32_35(ip, 0, op, parm); IPPE(ip); OPPE(op += 35*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_36(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_36(ip, op, parm) {\ + BITBLK128V32_36(ip, 0, op, parm);\ + BITBLK128V32_36(ip, 1, op, parm);\ + BITBLK128V32_36(ip, 2, op, parm);\ + 
BITBLK128V32_36(ip, 3, op, parm); IPPE(ip); OPPE(op += 36*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_37(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 23)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+24, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_37(ip, op, parm) {\ + BITBLK128V32_37(ip, 0, op, parm); IPPE(ip); OPPE(op += 37*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_38(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 26)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 6); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_38(ip, op, parm) {\ + BITBLK128V32_38(ip, 0, op, parm);\ + BITBLK128V32_38(ip, 1, op, parm); IPPE(ip); OPPE(op += 38*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_39(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 6)); _mm_storeu_si128(op++, 
ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_39(ip, op, parm) {\ + BITBLK128V32_39(ip, 0, op, parm); IPPE(ip); OPPE(op += 39*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_40(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*4+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*4+ 2, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 3, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_40(ip, op, parm) {\ + BITBLK128V32_40(ip, 0, op, parm);\ + BITBLK128V32_40(ip, 1, op, parm);\ + BITBLK128V32_40(ip, 2, op, parm);\ + BITBLK128V32_40(ip, 3, op, parm);\ + BITBLK128V32_40(ip, 4, op, parm);\ + BITBLK128V32_40(ip, 5, op, parm);\ + BITBLK128V32_40(ip, 6, op, parm);\ + BITBLK128V32_40(ip, 7, op, parm); IPPE(ip); OPPE(op += 40*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_41(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + 
VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_41(ip, op, parm) {\ + BITBLK128V32_41(ip, 0, op, parm); IPPE(ip); OPPE(op += 41*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_42(ip, 
i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_42(ip, op, parm) {\ + BITBLK128V32_42(ip, 0, op, parm);\ + BITBLK128V32_42(ip, 1, op, parm); IPPE(ip); OPPE(op += 42*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_43(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 7, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 17)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_43(ip, 
op, parm) {\ + BITBLK128V32_43(ip, 0, op, parm); IPPE(ip); OPPE(op += 43*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_44(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_44(ip, op, parm) {\ + BITBLK128V32_44(ip, 0, op, parm);\ + BITBLK128V32_44(ip, 1, op, parm);\ + BITBLK128V32_44(ip, 2, op, parm);\ + BITBLK128V32_44(ip, 3, op, parm); IPPE(ip); OPPE(op += 44*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_45(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, 
iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+14, iv, parm); 
ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); 
ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_45(ip, op, parm) {\ + BITBLK128V32_45(ip, 0, op, parm); IPPE(ip); OPPE(op += 45*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_46(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 
26);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_46(ip, op, parm) {\ + BITBLK128V32_46(ip, 0, op, parm);\ + BITBLK128V32_46(ip, 1, op, parm); IPPE(ip); OPPE(op += 46*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_47(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*32+13, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, 
i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_47(ip, op, parm) {\ + BITBLK128V32_47(ip, 0, op, parm); IPPE(ip); OPPE(op += 47*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_48(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*2+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*2+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*2+ 1, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_48(ip, op, parm) {\ + BITBLK128V32_48(ip, 0, op, parm);\ + BITBLK128V32_48(ip, 1, op, parm);\ + BITBLK128V32_48(ip, 2, op, parm);\ + BITBLK128V32_48(ip, 3, op, parm);\ + BITBLK128V32_48(ip, 4, op, parm);\ + BITBLK128V32_48(ip, 5, op, parm);\ + BITBLK128V32_48(ip, 6, op, parm);\ + BITBLK128V32_48(ip, 7, op, parm);\ + BITBLK128V32_48(ip, 8, op, parm);\ + BITBLK128V32_48(ip, 9, op, parm);\ + BITBLK128V32_48(ip, 10, op, parm);\ + 
BITBLK128V32_48(ip, 11, op, parm);\ + BITBLK128V32_48(ip, 12, op, parm);\ + BITBLK128V32_48(ip, 13, op, parm);\ + BITBLK128V32_48(ip, 14, op, parm);\ + BITBLK128V32_48(ip, 15, op, parm); IPPE(ip); OPPE(op += 48*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_49(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+23, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_49(ip, op, parm) {\ + BITBLK128V32_49(ip, 0, op, parm); IPPE(ip); OPPE(op += 49*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_50(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_50(ip, op, parm) {\ + BITBLK128V32_50(ip, 0, op, parm);\ + BITBLK128V32_50(ip, 1, op, parm); IPPE(ip); OPPE(op += 50*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_51(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 11)); _mm_storeu_si128(op++, 
ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_51(ip, op, parm) {\ + BITBLK128V32_51(ip, 0, op, parm); IPPE(ip); OPPE(op += 51*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_52(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*8+ 1, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_52(ip, op, parm) {\ + BITBLK128V32_52(ip, 0, op, parm);\ + BITBLK128V32_52(ip, 1, op, parm);\ + BITBLK128V32_52(ip, 2, op, parm);\ + BITBLK128V32_52(ip, 3, op, parm); IPPE(ip); OPPE(op += 52*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_53(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + 
VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_53(ip, op, parm) {\ + BITBLK128V32_53(ip, 0, op, parm); IPPE(ip); OPPE(op += 53*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_54(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_54(ip, op, parm) {\ + BITBLK128V32_54(ip, 0, op, parm);\ + BITBLK128V32_54(ip, 1, op, parm); IPPE(ip); OPPE(op += 54*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_55(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 3, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_55(ip, op, parm) {\ + BITBLK128V32_55(ip, 0, op, parm); IPPE(ip); OPPE(op += 55*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_56(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*4+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 3, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_56(ip, op, parm) {\ + BITBLK128V32_56(ip, 0, op, parm);\ + BITBLK128V32_56(ip, 1, op, parm);\ + BITBLK128V32_56(ip, 2, op, parm);\ + BITBLK128V32_56(ip, 3, op, parm);\ + BITBLK128V32_56(ip, 4, op, parm);\ + BITBLK128V32_56(ip, 5, op, parm);\ + BITBLK128V32_56(ip, 6, op, parm);\ + BITBLK128V32_56(ip, 7, op, parm); IPPE(ip); OPPE(op += 56*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_57(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, 
iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, 
parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm_storeu_si128(op++, 
ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_57(ip, op, parm) {\ + BITBLK128V32_57(ip, 0, op, parm); IPPE(ip); OPPE(op += 57*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_58(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_58(ip, op, parm) {\ + BITBLK128V32_58(ip, 0, op, parm);\ + BITBLK128V32_58(ip, 1, op, parm); IPPE(ip); OPPE(op += 58*4/sizeof(op[0]));\ +} + +#define 
BITBLK128V32_59(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+12, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_59(ip, op, parm) {\ + BITBLK128V32_59(ip, 0, op, parm); IPPE(ip); OPPE(op += 59*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_60(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + 
VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_60(ip, op, parm) {\ + BITBLK128V32_60(ip, 0, op, parm);\ + BITBLK128V32_60(ip, 1, op, parm);\ + BITBLK128V32_60(ip, 2, op, parm);\ + BITBLK128V32_60(ip, 3, op, parm); IPPE(ip); OPPE(op += 60*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_61(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+19, 
iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 3)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_61(ip, op, parm) {\ + BITBLK128V32_61(ip, 0, op, parm); IPPE(ip); OPPE(op += 61*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_62(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, 
ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_62(ip, op, parm) {\ + BITBLK128V32_62(ip, 0, op, parm);\ + BITBLK128V32_62(ip, 1, op, parm); IPPE(ip); OPPE(op += 62*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_63(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ + VSTI(ip, i*32+18, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 30);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_63(ip, op, parm) {\ + BITBLK128V32_63(ip, 0, op, parm); IPPE(ip); OPPE(op += 63*4/sizeof(op[0]));\ +} + +#define BITBLK128V32_64(ip, i, op, parm) { __m128i ov,iv;\ + VSTI(ip, i*1+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*1+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32); _mm_storeu_si128((__m128i *)op++, ov);\ +} + +#define BITPACK128V32_64(ip, op, parm) {\ + BITBLK128V32_64(ip, 0, op, parm);\ + BITBLK128V32_64(ip, 1, op, parm);\ + BITBLK128V32_64(ip, 2, op, parm);\ + BITBLK128V32_64(ip, 3, op, parm);\ + BITBLK128V32_64(ip, 4, op, parm);\ + BITBLK128V32_64(ip, 5, op, parm);\ + BITBLK128V32_64(ip, 6, op, parm);\ + BITBLK128V32_64(ip, 7, op, parm);\ + BITBLK128V32_64(ip, 8, op, parm);\ + BITBLK128V32_64(ip, 9, op, parm);\ + BITBLK128V32_64(ip, 10, op, parm);\ + BITBLK128V32_64(ip, 11, op, parm);\ + BITBLK128V32_64(ip, 12, op, parm);\ + BITBLK128V32_64(ip, 13, op, parm);\ + BITBLK128V32_64(ip, 14, op, parm);\ + BITBLK128V32_64(ip, 15, op, parm);\ + BITBLK128V32_64(ip, 16, op, parm);\ + BITBLK128V32_64(ip, 17, op, parm);\ + BITBLK128V32_64(ip, 18, op, parm);\ + BITBLK128V32_64(ip, 19, op, parm);\ + BITBLK128V32_64(ip, 20, op, parm);\ + BITBLK128V32_64(ip, 21, op, parm);\ + BITBLK128V32_64(ip, 22, op, parm);\ + BITBLK128V32_64(ip, 23, op, parm);\ + BITBLK128V32_64(ip, 24, op, parm);\ + BITBLK128V32_64(ip, 25, op, parm);\ + BITBLK128V32_64(ip, 26, op, parm);\ + BITBLK128V32_64(ip, 27, op, parm);\ + BITBLK128V32_64(ip, 28, op, parm);\ + BITBLK128V32_64(ip, 29, op, parm);\ + BITBLK128V32_64(ip, 30, op, parm);\ + BITBLK128V32_64(ip, 31, op, parm); IPPE(ip); OPPE(op += 64*4/sizeof(op[0]));\ +} + diff --git a/bitpack256v.c b/bitpack256v.c new file mode 100644 index 0000000..7944484 
--- /dev/null
+++ b/bitpack256v.c
@@ -0,0 +1,119 @@
+/**
+    Copyright (C) powturbo 2013-2017
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - homepage : https://sites.google.com/site/powturbo/
+    - github : https://github.com/powturbo
+    - twitter : https://twitter.com/powturbo
+    - email : powturbo [_AT_] gmail [_DOT_] com
+**/
+// "Integer Compression" SIMD bit packing of 32-bit integers with 256-bit (__m256i) vectors. This file is read twice: the #ifndef VSTI half defines the VSTI/IPP hooks and then includes this same file again (#include __FILE__), which takes the #else half and defines the BITPACK256V32 dispatcher.
+  #ifndef VSTI // first pass: VSTI is not defined yet
+#include // NOTE(review): the header name is missing from this #include in this copy of the patch (presumably lost in extraction) - restore it before applying
+#include "bitpack.h"
+#include "bitutil.h"
+// PAD8(__x): __x bits expressed in bytes, rounded up to a whole byte.
+#define PAD8(__x) (((__x)+8-1)/8)
+// Hooks used by the BITBLK256V32_* macros of bitpack256v_.h: VSTI runs before each input vector (empty here, i.e. no transform); IPP yields the next input vector with an unaligned 256-bit load and advances ip.
+#define VSTI(ip, i, iv, parm)
+#define IPP(ip, i, iv) _mm256_loadu_si256(ip++)
+#include __FILE__ // second pass over this file: VSTI is now defined, so the #else half at the bottom is what gets expanded
+// bitpack256v32: runs BITPACK256V32 on in/out with bit width b and returns out + PAD8(256*b). Widths 0..32 have a case in the dispatcher; a larger b matches no case, so nothing is written although the returned pointer still advances.
+unsigned char *bitpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; }
+#undef VSTI
+#undef IPP
+// Everything between the #if 0 below and its #endif is disabled and never compiled; judging by the names (DELTA256x32, ZIGZAG256x32) these are unfinished delta / delta-minus-one / zigzag variants - TODO confirm before enabling.
+//------------------------------------------------------------------------------------------------------------------------------
+#if 0
+#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); DELTA256x32(v,__sv, __iv) //__sv = v
+#define IPP(ip, i, __iv) __iv
+#include __FILE__
+
+unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+  __m256i v; //,sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256();
+  __m128i sv = _mm_set1_epi32(start); // NOTE(review): 128-bit start vector inside a 256-bit routine - looks unfinished, confirm against DELTA256x32
+  BITPACK256V32(in, b, out, sv);
+  return pout;
+}
+#undef VSTI
+
+//------------------------------------------------------------------------------------------------------------------------------
+#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = _mm256_sub_epi32(DELTA256x32(v,__sv),cv); __sv = v
+
+unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+  __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
+  //BITPACK256V32(in, b, out, sv); return pout; -- NOTE(review): while this line stays commented out the function has no return statement
+}
+#undef VSTI
+//------------------------------------------------------------------------------------------------------------------------------
+#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = DELTA256x32(v,__sv); __sv = v; __iv = ZIGZAG256x32(__iv)
+
+unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b);
+  __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1);
+  //BITPACK256V32(in, b, out, sv);
+  return pout;
+}
+#endif
+
+#undef VSTI
+  #else // second pass, reached through #include __FILE__ with VSTI/IPP supplied by the including half
+#include // NOTE(review): header name missing in this copy of the patch - restore before applying
+#include // NOTE(review): header name missing as well; the __m256i / _mm256_* intrinsics used here are declared in <immintrin.h>
+// OPPE/IPPE: hooks invoked at the end of each BITPACK256V32_<n> macro; defined empty here, so their argument expressions are discarded.
+#define OPPE(__op)
+#define IPPE(__op)
+
+#include "bitpack256v_.h"
+// BITPACK256V32(__pip, __nbits, __pop, __parm): casts both pointers into local __m256i* copies and dispatches on __nbits to BITPACK256V32_<n>; 0 emits nothing and values above 32 match no case. Only the local copies are advanced, the caller's pointers are left untouched.
+#define BITPACK256V32(__pip, __nbits, __pop, __parm) { __m256i *__ip=(__m256i *)__pip,*__op=(__m256i *)__pop;\
+  switch(__nbits) {\
+    case 0: break;\
+    case 1:{ BITPACK256V32_1( __ip, __op, __parm); } break;\
+    case 2:{ BITPACK256V32_2( __ip, __op, __parm); } break;\
+    case 3:{ BITPACK256V32_3( __ip, __op, __parm); } break;\
+    case 4:{ BITPACK256V32_4( __ip, __op, __parm); } break;\
+    case 5:{ BITPACK256V32_5( __ip, __op, __parm); } break;\
+    case 6:{ BITPACK256V32_6( __ip, __op, __parm); } break;\
+    case 7:{ BITPACK256V32_7( __ip, __op, __parm); } break;\
+    case 8:{ BITPACK256V32_8( __ip, __op, __parm); } break;\
+    case 9:{ BITPACK256V32_9( __ip, __op, __parm); } break;\
+    case 10:{ BITPACK256V32_10(__ip, __op, __parm); } break;\
+    case 11:{ BITPACK256V32_11(__ip, __op, __parm); } break;\
+    case 12:{ BITPACK256V32_12(__ip, __op, __parm); } break;\
+    case 13:{ BITPACK256V32_13(__ip, __op, __parm); } break;\
+    case 14:{ BITPACK256V32_14(__ip, __op, __parm); } break;\
+    case 15:{ BITPACK256V32_15(__ip, __op, __parm); } break;\
+    case 16:{ BITPACK256V32_16(__ip, __op, __parm); } break;\
+    case 17:{ BITPACK256V32_17(__ip, __op, __parm); } break;\
+    case 18:{ BITPACK256V32_18(__ip, __op, __parm); } break;\
+    case 19:{ BITPACK256V32_19(__ip, __op, __parm); } break;\
+    case 20:{ BITPACK256V32_20(__ip, __op, __parm); } break;\
+    case 21:{ BITPACK256V32_21(__ip, __op, __parm); } break;\
+    case 22:{ BITPACK256V32_22(__ip, __op, __parm); } break;\
+    case 23:{ BITPACK256V32_23(__ip, __op, __parm); } break;\
+    case 24:{ BITPACK256V32_24(__ip, __op, __parm); } break;\
+    case 25:{ BITPACK256V32_25(__ip, __op, __parm); } break;\
+    case 26:{ BITPACK256V32_26(__ip, __op, __parm); } break;\
+    case 27:{ BITPACK256V32_27(__ip, __op, __parm); } break;\
+    case 28:{ BITPACK256V32_28(__ip, __op, __parm); } break;\
+    case 29:{ BITPACK256V32_29(__ip, __op, __parm); } break;\
+    case 30:{ BITPACK256V32_30(__ip, __op, __parm); } break;\
+    case 31:{ BITPACK256V32_31(__ip, __op, __parm); } break;\
+    case 32:{ BITPACK256V32_32(__ip, __op, __parm); } break;\
+  }\
+}
+  #endif
diff --git a/bitpack256v_.h b/bitpack256v_.h
new file mode 100644
index 0000000..7fb31e7
--- /dev/null
+++ b/bitpack256v_.h
@@ -0,0 +1,1998 @@
+/**
+    Copyright (C) powturbo 2013-2017
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// TurboPFor: Integer Compression SIMD bit packing +#define BITBLK256V32_1(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 1));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 3));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 5));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 7));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 9));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 10));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 11));\ + VSTI(ip, i*32+12, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 12));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 13));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 14));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 15));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 17));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 18));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 19));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 20));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 21));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 22));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 23));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 24));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 25));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 26));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 27));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 28));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 29));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 30));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*32+31, iv), 31)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_1(ip, op, parm) {\ + BITBLK256V32_1(ip, 0, op, parm); IPPE(ip); OPPE(op += 1*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_2(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 2));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 6));\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 10));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 14));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 18));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 20));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+11, iv), 22));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 24));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 26));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 28));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 30)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_2(ip, op, parm) {\ + 
BITBLK256V32_2(ip, 0, op, parm);\ + BITBLK256V32_2(ip, 1, op, parm); IPPE(ip); OPPE(op += 2*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_3(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 3));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 6));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 9));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 15));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 18));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 21));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 27));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 1));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 7));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 10));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 13));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 19));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 22));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 25));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 28));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 2));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 5));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 11));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 14));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 17));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 20));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 23));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 26));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 29)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_3(ip, op, parm) {\ + BITBLK256V32_3(ip, 0, op, parm); IPPE(ip); OPPE(op += 3*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_4(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 1, iv), 4));\ + VSTI(ip, i*8+ 2, iv, 
parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 2, iv), 8));\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 3, iv), 12));\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 5, iv), 20));\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 6, iv), 24));\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 28)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_4(ip, op, parm) {\ + BITBLK256V32_4(ip, 0, op, parm);\ + BITBLK256V32_4(ip, 1, op, parm);\ + BITBLK256V32_4(ip, 2, op, parm);\ + BITBLK256V32_4(ip, 3, op, parm); IPPE(ip); OPPE(op += 4*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_5(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 5));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 15));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 20));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 25));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 3));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 13));\ + VSTI(ip, 
i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 18));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 23));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 1));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 6));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 11));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 21));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 26));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 4));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 9));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 14));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 19));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 24));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 2));\ + VSTI(ip, i*32+27, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 7));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 12));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 17));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 22));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 27)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_5(ip, op, parm) {\ + BITBLK256V32_5(ip, 0, op, parm); IPPE(ip); OPPE(op += 5*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_6(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 6));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 12));\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 18));\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 24));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 10));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 22));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*16+11, iv), 2));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 8));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 14));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 20));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 26)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_6(ip, op, parm) {\ + BITBLK256V32_6(ip, 0, op, parm);\ + BITBLK256V32_6(ip, 1, op, parm); IPPE(ip); OPPE(op += 6*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_7(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 7));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 14));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 21));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 3));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 10));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 17));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 6));\ + VSTI(ip, i*32+11, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 13));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 20));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 2));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 9));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 23));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 5));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 12));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 19));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 1));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 15));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 22));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + 
VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 4));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 11));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 18));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 25)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_7(ip, op, parm) { /* 7-bit fields: one BITBLK256V32_7 pass, then the IPPE/OPPE hooks */\ + BITBLK256V32_7(ip, 0, op, parm); IPPE(ip); OPPE(op += 7*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_8(ip, i, op, parm) { __m256i ov,iv; /* ORs 4 IPP() vectors into ov at 8-bit steps per 32-bit lane; 1 store through op++ */\ + VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 1, iv), 8));\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 2, iv), 16));\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 3, iv), 24)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_8(ip, op, parm) { /* 8-bit fields: eight BITBLK256V32_8 passes, then the IPPE/OPPE hooks */\ + BITBLK256V32_8(ip, 0, op, parm);\ + BITBLK256V32_8(ip, 1, op, parm);\ + BITBLK256V32_8(ip, 2, op, parm);\ + BITBLK256V32_8(ip, 3, op, parm);\ + BITBLK256V32_8(ip, 4, op, parm);\ + BITBLK256V32_8(ip, 5, op, parm);\ + BITBLK256V32_8(ip, 6, op, parm);\ + BITBLK256V32_8(ip, 7, op, parm); IPPE(ip); OPPE(op += 8*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_9(ip, i, op, parm) { __m256i ov,iv; /* ORs 32 IPP() vectors into ov at 9-bit steps per 32-bit lane; 9 stores through op++ */\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 9));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 18));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( 
IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 13));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 22));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 17));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 3));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 12));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 21));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 7));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 2));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 11));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 20));\ + VSTI(ip, i*32+21, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 6));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 15));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 1));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 10));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 19));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 5));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 14));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 23)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_9(ip, op, parm) { /* 9-bit fields: one BITBLK256V32_9 pass, then the IPPE/OPPE hooks */\ + BITBLK256V32_9(ip, 0, op, parm); IPPE(ip); OPPE(op += 9*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_10(ip, i, op, parm) { __m256i ov,iv; /* ORs 16 IPP() vectors into ov at 10-bit steps per 32-bit lane; 5 stores through op++ */\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 10));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 20));\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 4, iv, 
parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 18));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 6));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 4));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+11, iv), 14));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 2));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 12));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 22)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_10(ip, op, parm) { /* 10-bit fields: two BITBLK256V32_10 passes, then the IPPE/OPPE hooks */\ + BITBLK256V32_10(ip, 0, op, parm);\ + BITBLK256V32_10(ip, 1, op, parm); IPPE(ip); OPPE(op += 10*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_11(ip, i, op, parm) { __m256i ov,iv; /* ORs 32 IPP() vectors into ov at 11-bit steps per 32-bit lane; 11 stores through op++ */\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 11));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 1));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 2));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 13));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 3));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 14));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 15));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 5));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, 
i*32+18, iv), 6));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 17));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 7));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 18));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 19));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 9));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 20));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 10));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 21)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_11(ip, op, parm) { /* 11-bit fields: one BITBLK256V32_11 pass, then the IPPE/OPPE hooks */\ + BITBLK256V32_11(ip, 0, op, parm); IPPE(ip); OPPE(op += 11*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_12(ip, i, op, parm) { __m256i ov,iv; /* ORs 8 IPP() vectors into ov at 12-bit steps per 32-bit lane; 3 stores through op++ */\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( 
IPP(ip, i*8+ 1, iv), 12));\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 3, iv), 4));\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 6, iv), 8));\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 20)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_12(ip, op, parm) { /* 12-bit fields: four BITBLK256V32_12 passes, then the IPPE/OPPE hooks */\ + BITBLK256V32_12(ip, 0, op, parm);\ + BITBLK256V32_12(ip, 1, op, parm);\ + BITBLK256V32_12(ip, 2, op, parm);\ + BITBLK256V32_12(ip, 3, op, parm); IPPE(ip); OPPE(op += 12*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_13(ip, i, op, parm) { __m256i ov,iv; /* ORs 32 IPP() vectors into ov at 13-bit steps per 32-bit lane; 13 stores through op++ */\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 13));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 7));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 1));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 14));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 2));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 15));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 9));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 3));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 10));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 4));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 17));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 
30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 11));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 5));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 18));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 12));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 6));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 19)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_13(ip, op, parm) { /* 13-bit fields: one BITBLK256V32_13 pass, then the IPPE/OPPE hooks */\ + BITBLK256V32_13(ip, 0, op, parm); IPPE(ip); OPPE(op += 13*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_14(ip, i, op, parm) { __m256i ov,iv; /* ORs 16 IPP() vectors into ov at 14-bit steps per 32-bit lane; 7 stores through op++ */\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 14));\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 10));\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 6));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 2));\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 12));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 8));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 4));\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 18)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_14(ip, op, parm) { /* 14-bit fields: two BITBLK256V32_14 passes, then the IPPE/OPPE hooks */\ + BITBLK256V32_14(ip, 0, op, parm);\ + BITBLK256V32_14(ip, 1, op, parm); IPPE(ip); OPPE(op += 14*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_15(ip, i, op, parm) { __m256i ov,iv; /* ORs 32 IPP() vectors into ov at 15-bit steps per 32-bit lane; 15 stores through op++ */\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 15));\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, 
iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 13));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 11));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 9));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 7));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 5));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 3));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 1));\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 14));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 12));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 10));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 6));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 4));\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 2));\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 17)); _mm256_storeu_si256((__m256i *)op++, 
ov);\ +} + +#define BITPACK256V32_15(ip, op, parm) { /* 15-bit fields: one BITBLK256V32_15 pass, then the IPPE/OPPE hooks */\ + BITBLK256V32_15(ip, 0, op, parm); IPPE(ip); OPPE(op += 15*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_16(ip, i, op, parm) { __m256i ov,iv; /* ORs 2 IPP() vectors into ov at 16-bit steps per 32-bit lane; 1 store through op++ */\ + VSTI(ip, i*2+ 0, iv, parm); ov = IPP(ip, i*2+ 0, iv);\ + VSTI(ip, i*2+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*2+ 1, iv), 16)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_16(ip, op, parm) { /* 16-bit fields: sixteen BITBLK256V32_16 passes, then the IPPE/OPPE hooks */\ + BITBLK256V32_16(ip, 0, op, parm);\ + BITBLK256V32_16(ip, 1, op, parm);\ + BITBLK256V32_16(ip, 2, op, parm);\ + BITBLK256V32_16(ip, 3, op, parm);\ + BITBLK256V32_16(ip, 4, op, parm);\ + BITBLK256V32_16(ip, 5, op, parm);\ + BITBLK256V32_16(ip, 6, op, parm);\ + BITBLK256V32_16(ip, 7, op, parm);\ + BITBLK256V32_16(ip, 8, op, parm);\ + BITBLK256V32_16(ip, 9, op, parm);\ + BITBLK256V32_16(ip, 10, op, parm);\ + BITBLK256V32_16(ip, 11, op, parm);\ + BITBLK256V32_16(ip, 12, op, parm);\ + BITBLK256V32_16(ip, 13, op, parm);\ + BITBLK256V32_16(ip, 14, op, parm);\ + BITBLK256V32_16(ip, 15, op, parm); IPPE(ip); OPPE(op += 16*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_17(ip, i, op, parm) { __m256i ov,iv; /* ORs 32 IPP() vectors into ov at 17-bit steps per 32-bit lane; 17 stores through op++ */\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 10));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 12));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 14));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 1));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 3));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); 
ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 5));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 7));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 9));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 11));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 13));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 15)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_17(ip, op, parm) { /* 17-bit fields: one BITBLK256V32_17 pass, then the IPPE/OPPE hooks */\ + BITBLK256V32_17(ip, 0, op, parm); IPPE(ip); OPPE(op += 17*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_18(ip, i, op, parm) { __m256i ov,iv; /* ORs 16 IPP() vectors into ov at 18-bit steps per 32-bit lane; 9 stores through op++ */\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 2, iv, parm); 
ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 2));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+11, iv), 6));\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 10));\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 14)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_18(ip, op, parm) { /* 18-bit fields: two BITBLK256V32_18 passes, then the IPPE/OPPE hooks */\ + BITBLK256V32_18(ip, 
0, op, parm);\ + BITBLK256V32_18(ip, 1, op, parm); IPPE(ip); OPPE(op += 18*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_19(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 6));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 5));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 11));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 10));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 3));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 9));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 2));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + 
VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 1));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 7));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 13)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_19(ip, op, parm) { /* expands BITBLK256V32_19 once (i = 0), then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_19(ip, 0, op, parm); IPPE(ip); OPPE(op += 19*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_20(ip, i, op, parm) { /* ORs 8 IPP() vectors at 20 bits per 32-bit lane into 5 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 2, iv), 8));\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 5, iv), 4));\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 12)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_20(ip, op,
parm) {\ + BITBLK256V32_20(ip, 0, op, parm);\ + BITBLK256V32_20(ip, 1, op, parm);\ + BITBLK256V32_20(ip, 2, op, parm);\ + BITBLK256V32_20(ip, 3, op, parm); IPPE(ip); OPPE(op += 20*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_21(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 9));\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 7));\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 6));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 5));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 4));\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 3));\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 13)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 2));\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 1));\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 11)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_21(ip, op, parm) { /* expands BITBLK256V32_21 once (i = 0), then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_21(ip, 0, op, parm); IPPE(ip); OPPE(op += 21*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_22(ip, i, op, parm) { /* ORs 16 IPP() vectors at 22 bits per 32-bit lane into 11 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 2));\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 6, iv, parm); ov
= _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 6));\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 8));\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 10)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_22(ip, op, parm) { /* expands BITBLK256V32_22 for i = 0 and 1, then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_22(ip, 0, op, parm);\ + BITBLK256V32_22(ip, 1, op, parm); IPPE(ip); OPPE(op += 22*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_23(ip, i, op, parm) { /* ORs 32 IPP() vectors at 23 bits per 32-bit lane into 23 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2,
iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 5));\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 1));\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 6));\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 2));\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 7));\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 3));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 4));\ + VSTI(ip, i*32+29, iv, 
parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 9)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_23(ip, op, parm) { /* expands BITBLK256V32_23 once (i = 0), then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_23(ip, 0, op, parm); IPPE(ip); OPPE(op += 23*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_24(ip, i, op, parm) { /* ORs 4 IPP() vectors at 24 bits per 32-bit lane into 3 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 3, iv), 8)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_24(ip, op, parm) { /* expands BITBLK256V32_24 for i = 0..7, then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_24(ip, 0, op, parm);\ + BITBLK256V32_24(ip, 1, op, parm);\ + BITBLK256V32_24(ip, 2, op, parm);\ + BITBLK256V32_24(ip, 3, op, parm);\ + BITBLK256V32_24(ip, 4, op, parm);\ + BITBLK256V32_24(ip, 5, op, parm);\ + BITBLK256V32_24(ip, 6, op, parm);\ + BITBLK256V32_24(ip, 7, op, parm); IPPE(ip); OPPE(op += 24*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_25(ip, i, op, parm) { /* ORs 32 IPP() vectors at 25 bits per 32-bit lane into 25 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18));
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 1));\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 5));\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, 
i*32+15, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 2));\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 6));\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 3));\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 7)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_25(ip, op, parm) { /* expands BITBLK256V32_25 once (i = 0), then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_25(ip, 0, op, parm); IPPE(ip); OPPE(op += 25*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_26(ip, i, op, parm) { /* ORs 16 IPP() vectors at 26 bits per 32-bit lane into 13 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 2));\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 8,
iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 4));\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 6)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_26(ip, op, parm) { /* expands BITBLK256V32_26 for i = 0 and 1, then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_26(ip, 0, op, parm);\ + BITBLK256V32_26(ip, 1, op, parm); IPPE(ip); OPPE(op += 26*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_27(ip, i, op, parm) { /* ORs 32 IPP() vectors at 27 bits per 32-bit lane into 27 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17));
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 2));\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+16, iv, 
parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 1));\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 3));\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 5)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_27(ip, op, parm) { /* expands BITBLK256V32_27 once (i = 0), then the IPPE(ip) and OPPE(...) hooks */\ + BITBLK256V32_27(ip, 0, op, parm); IPPE(ip); OPPE(op += 27*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_28(ip, i, op, parm) { /* ORs 8 IPP() vectors at 28 bits per 32-bit lane into 7 stored vectors; VSTI() runs before each; no masking is done here */ __m256i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 4)); _mm256_storeu_si256((__m256i *)op++, ov);\ +} + +#define BITPACK256V32_28(ip, op, parm)
{\ + BITBLK256V32_28(ip, 0, op, parm);\ + BITBLK256V32_28(ip, 1, op, parm);\ + BITBLK256V32_28(ip, 2, op, parm);\ + BITBLK256V32_28(ip, 3, op, parm); IPPE(ip); OPPE(op += 28*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_29(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 2));\ + VSTI(ip, i*32+11, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 1));\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 3)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_29(ip, op, parm) {\ + BITBLK256V32_29(ip, 0, op, parm); IPPE(ip); OPPE(op += 29*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_30(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 2)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_30(ip, op, parm) {\ + BITBLK256V32_30(ip, 0, op, parm);\ + BITBLK256V32_30(ip, 1, op, parm); IPPE(ip); OPPE(op += 30*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_31(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 
9, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 1)); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_31(ip, op, parm) {\ + BITBLK256V32_31(ip, 0, op, parm); IPPE(ip); OPPE(op += 31*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_32(ip, i, op, parm) { __m256i 
ov,iv;\ + VSTI(ip, i*1+ 0, iv, parm); ov = IPP(ip, i*1+ 0, iv); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_32(ip, op, parm) {\ + BITBLK256V32_32(ip, 0, op, parm);\ + BITBLK256V32_32(ip, 1, op, parm);\ + BITBLK256V32_32(ip, 2, op, parm);\ + BITBLK256V32_32(ip, 3, op, parm);\ + BITBLK256V32_32(ip, 4, op, parm);\ + BITBLK256V32_32(ip, 5, op, parm);\ + BITBLK256V32_32(ip, 6, op, parm);\ + BITBLK256V32_32(ip, 7, op, parm);\ + BITBLK256V32_32(ip, 8, op, parm);\ + BITBLK256V32_32(ip, 9, op, parm);\ + BITBLK256V32_32(ip, 10, op, parm);\ + BITBLK256V32_32(ip, 11, op, parm);\ + BITBLK256V32_32(ip, 12, op, parm);\ + BITBLK256V32_32(ip, 13, op, parm);\ + BITBLK256V32_32(ip, 14, op, parm);\ + BITBLK256V32_32(ip, 15, op, parm);\ + BITBLK256V32_32(ip, 16, op, parm);\ + BITBLK256V32_32(ip, 17, op, parm);\ + BITBLK256V32_32(ip, 18, op, parm);\ + BITBLK256V32_32(ip, 19, op, parm);\ + BITBLK256V32_32(ip, 20, op, parm);\ + BITBLK256V32_32(ip, 21, op, parm);\ + BITBLK256V32_32(ip, 22, op, parm);\ + BITBLK256V32_32(ip, 23, op, parm);\ + BITBLK256V32_32(ip, 24, op, parm);\ + BITBLK256V32_32(ip, 25, op, parm);\ + BITBLK256V32_32(ip, 26, op, parm);\ + BITBLK256V32_32(ip, 27, op, parm);\ + BITBLK256V32_32(ip, 28, op, parm);\ + BITBLK256V32_32(ip, 29, op, parm);\ + BITBLK256V32_32(ip, 30, op, parm);\ + BITBLK256V32_32(ip, 31, op, parm); IPPE(ip); OPPE(op += 32*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_33(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 3, 
iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, 
i*32+26, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_33(ip, op, parm) {\ + BITBLK256V32_33(ip, 0, op, parm); IPPE(ip); OPPE(op += 33*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_34(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_34(ip, op, parm) {\ + 
BITBLK256V32_34(ip, 0, op, parm);\ + BITBLK256V32_34(ip, 1, op, parm); IPPE(ip); OPPE(op += 34*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_35(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_35(ip, op, parm) {\ + BITBLK256V32_35(ip, 0, op, parm); IPPE(ip); OPPE(op += 35*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_36(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_36(ip, op, parm) {\ + BITBLK256V32_36(ip, 0, op, parm);\ + BITBLK256V32_36(ip, 1, op, parm);\ + BITBLK256V32_36(ip, 2, op, parm);\ + BITBLK256V32_36(ip, 3, op, parm); IPPE(ip); OPPE(op += 36*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_37(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, 
i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 29)); _mm256_storeu_si256(op++, 
ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_37(ip, op, parm) {\ + BITBLK256V32_37(ip, 0, op, parm); IPPE(ip); OPPE(op += 37*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_38(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 4, iv, 
parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 26)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 6); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_38(ip, op, parm) {\ + BITBLK256V32_38(ip, 0, op, parm);\ + BITBLK256V32_38(ip, 1, op, parm); IPPE(ip); OPPE(op += 38*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_39(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 
1);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 19)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_39(ip, op, parm) {\ + BITBLK256V32_39(ip, 0, op, parm); IPPE(ip); OPPE(op += 39*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_40(ip, i, op, parm) { __m256i 
ov,iv;\ + VSTI(ip, i*4+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 3, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_40(ip, op, parm) {\ + BITBLK256V32_40(ip, 0, op, parm);\ + BITBLK256V32_40(ip, 1, op, parm);\ + BITBLK256V32_40(ip, 2, op, parm);\ + BITBLK256V32_40(ip, 3, op, parm);\ + BITBLK256V32_40(ip, 4, op, parm);\ + BITBLK256V32_40(ip, 5, op, parm);\ + BITBLK256V32_40(ip, 6, op, parm);\ + BITBLK256V32_40(ip, 7, op, parm); IPPE(ip); OPPE(op += 40*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_41(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, 
iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+28, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_41(ip, op, parm) {\ + BITBLK256V32_41(ip, 0, op, parm); IPPE(ip); OPPE(op += 41*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_42(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, 
iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_42(ip, op, parm) {\ + BITBLK256V32_42(ip, 0, op, parm);\ + BITBLK256V32_42(ip, 1, op, parm); IPPE(ip); OPPE(op += 42*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_43(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, 
i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_43(ip, op, parm) {\ + BITBLK256V32_43(ip, 0, op, parm); IPPE(ip); OPPE(op += 43*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_44(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv 
= IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_44(ip, op, parm) {\ + BITBLK256V32_44(ip, 0, op, parm);\ + BITBLK256V32_44(ip, 1, op, parm);\ + BITBLK256V32_44(ip, 2, op, parm);\ + BITBLK256V32_44(ip, 3, op, parm); IPPE(ip); OPPE(op += 44*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_45(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 4, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 3)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, 
iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_45(ip, op, parm) {\ + BITBLK256V32_45(ip, 0, op, parm); IPPE(ip); OPPE(op += 45*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_46(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 6)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_46(ip, op, parm) {\ + BITBLK256V32_46(ip, 0, op, parm);\ + BITBLK256V32_46(ip, 1, op, parm); IPPE(ip); OPPE(op += 46*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_47(ip, i, op, parm) { __m256i 
ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 5)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_47(ip, op, parm) {\ + BITBLK256V32_47(ip, 0, op, parm); IPPE(ip); OPPE(op += 47*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_48(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*2+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*2+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*2+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*2+ 1, iv), 16)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_48(ip, op, parm) {\ + BITBLK256V32_48(ip, 0, op, parm);\ + BITBLK256V32_48(ip, 1, op, parm);\ + BITBLK256V32_48(ip, 2, op, parm);\ + BITBLK256V32_48(ip, 3, op, parm);\ + BITBLK256V32_48(ip, 4, op, parm);\ + BITBLK256V32_48(ip, 5, op, parm);\ + BITBLK256V32_48(ip, 6, op, parm);\ + BITBLK256V32_48(ip, 7, op, parm);\ + BITBLK256V32_48(ip, 8, op, parm);\ + BITBLK256V32_48(ip, 9, op, parm);\ + BITBLK256V32_48(ip, 10, op, parm);\ + BITBLK256V32_48(ip, 11, op, parm);\ + BITBLK256V32_48(ip, 12, op, parm);\ + BITBLK256V32_48(ip, 13, op, parm);\ + BITBLK256V32_48(ip, 14, op, parm);\ + BITBLK256V32_48(ip, 15, op, parm); IPPE(ip); OPPE(op += 48*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_49(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, 
i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_49(ip, op, parm) {\ + BITBLK256V32_49(ip, 0, op, parm); IPPE(ip); OPPE(op += 49*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_50(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_50(ip, op, parm) {\ + BITBLK256V32_50(ip, 0, op, parm);\ + BITBLK256V32_50(ip, 1, op, parm); IPPE(ip); OPPE(op += 50*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_51(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*32+ 2, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+14, iv, parm); ov 
= _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_51(ip, op, parm) {\ + BITBLK256V32_51(ip, 0, op, parm); IPPE(ip); OPPE(op += 51*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_52(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 4, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_52(ip, op, parm) {\ + BITBLK256V32_52(ip, 0, op, parm);\ + BITBLK256V32_52(ip, 1, op, parm);\ + BITBLK256V32_52(ip, 2, op, parm);\ + BITBLK256V32_52(ip, 3, op, parm); IPPE(ip); OPPE(op += 52*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_53(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 9)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, 
iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_53(ip, op, parm) {\ + BITBLK256V32_53(ip, 0, op, parm); IPPE(ip); OPPE(op += 53*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_54(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, 
iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_54(ip, op, parm) {\ + BITBLK256V32_54(ip, 0, op, parm);\ + BITBLK256V32_54(ip, 1, op, parm); IPPE(ip); OPPE(op += 54*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_55(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, 
i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_55(ip, op, parm) {\ + BITBLK256V32_55(ip, 0, op, parm); IPPE(ip); OPPE(op += 55*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_56(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*4+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*4+ 3, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_56(ip, op, parm) {\ + BITBLK256V32_56(ip, 0, op, parm);\ + BITBLK256V32_56(ip, 1, op, parm);\ + BITBLK256V32_56(ip, 2, op, parm);\ + BITBLK256V32_56(ip, 3, op, parm);\ + BITBLK256V32_56(ip, 4, op, parm);\ + BITBLK256V32_56(ip, 5, op, parm);\ + BITBLK256V32_56(ip, 6, op, parm);\ + BITBLK256V32_56(ip, 7, op, parm); IPPE(ip); OPPE(op += 56*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_57(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+ 8, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, 
iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_57(ip, op, parm) {\ + BITBLK256V32_57(ip, 0, op, parm); IPPE(ip); OPPE(op += 57*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_58(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_58(ip, op, parm) {\ + BITBLK256V32_58(ip, 0, op, parm);\ + BITBLK256V32_58(ip, 1, op, parm); IPPE(ip); OPPE(op += 58*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_59(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 
15);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_59(ip, op, parm) {\ + BITBLK256V32_59(ip, 0, op, parm); IPPE(ip); OPPE(op += 59*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_60(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_60(ip, op, parm) {\ + BITBLK256V32_60(ip, 0, op, parm);\ + BITBLK256V32_60(ip, 1, op, parm);\ + BITBLK256V32_60(ip, 2, op, parm);\ + BITBLK256V32_60(ip, 3, op, parm); IPPE(ip); OPPE(op += 60*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_61(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + 
VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_61(ip, op, parm) {\ + BITBLK256V32_61(ip, 0, op, parm); IPPE(ip); OPPE(op += 61*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_62(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 
16);\ + VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_62(ip, op, parm) {\ + BITBLK256V32_62(ip, 0, op, parm);\ + BITBLK256V32_62(ip, 1, op, parm); IPPE(ip); OPPE(op += 62*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_63(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ + VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ + VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ + VSTI(ip, i*32+ 3, 
iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ + VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ + VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ + VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ + VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ + VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ + VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ + VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ + VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ + VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ + VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ + VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 14);\ + VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ + VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ + VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ + VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ + VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ + VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ + VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ + VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ + VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ + VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ + VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ + VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, 
i*32+26, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ + VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ + VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ + VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ + VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ + VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_63(ip, op, parm) {\ + BITBLK256V32_63(ip, 0, op, parm); IPPE(ip); OPPE(op += 63*4/sizeof(op[0]));\ +} + +#define BITBLK256V32_64(ip, i, op, parm) { __m256i ov,iv;\ + VSTI(ip, i*1+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*1+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32); _mm256_storeu_si256((__m128i *)op++, ov);\ +} + +#define BITPACK256V32_64(ip, op, parm) {\ + BITBLK256V32_64(ip, 0, op, parm);\ + BITBLK256V32_64(ip, 1, op, parm);\ + BITBLK256V32_64(ip, 2, op, parm);\ + BITBLK256V32_64(ip, 3, op, parm);\ + BITBLK256V32_64(ip, 4, op, parm);\ + BITBLK256V32_64(ip, 5, op, parm);\ + BITBLK256V32_64(ip, 6, op, parm);\ + BITBLK256V32_64(ip, 7, op, parm);\ + BITBLK256V32_64(ip, 8, op, parm);\ + BITBLK256V32_64(ip, 9, op, parm);\ + BITBLK256V32_64(ip, 10, op, parm);\ + BITBLK256V32_64(ip, 11, op, parm);\ + BITBLK256V32_64(ip, 12, op, parm);\ + BITBLK256V32_64(ip, 13, op, parm);\ + BITBLK256V32_64(ip, 14, op, parm);\ + 
BITBLK256V32_64(ip, 15, op, parm);\ + BITBLK256V32_64(ip, 16, op, parm);\ + BITBLK256V32_64(ip, 17, op, parm);\ + BITBLK256V32_64(ip, 18, op, parm);\ + BITBLK256V32_64(ip, 19, op, parm);\ + BITBLK256V32_64(ip, 20, op, parm);\ + BITBLK256V32_64(ip, 21, op, parm);\ + BITBLK256V32_64(ip, 22, op, parm);\ + BITBLK256V32_64(ip, 23, op, parm);\ + BITBLK256V32_64(ip, 24, op, parm);\ + BITBLK256V32_64(ip, 25, op, parm);\ + BITBLK256V32_64(ip, 26, op, parm);\ + BITBLK256V32_64(ip, 27, op, parm);\ + BITBLK256V32_64(ip, 28, op, parm);\ + BITBLK256V32_64(ip, 29, op, parm);\ + BITBLK256V32_64(ip, 30, op, parm);\ + BITBLK256V32_64(ip, 31, op, parm); IPPE(ip); OPPE(op += 64*4/sizeof(op[0]));\ +} + diff --git a/bitpack64_.h b/bitpack64_.h index 1fc0605..3743d7c 100644 --- a/bitpack64_.h +++ b/bitpack64_.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -2256,3 +2256,4 @@ BITBLK64_64(ip, 30, op, parm);\ BITBLK64_64(ip, 31, op, parm); SRCI(ip); op += 64*4/sizeof(op[0]);\ } + diff --git a/bitpackv.c b/bitpackv.c index 06f4d37..fb854f6 100644 --- a/bitpackv.c +++ b/bitpackv.c @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// bitpackv.c - "Integer Compression" SIMD bit packing +// "Integer Compression" SIMD bit packing #ifndef VSTI #include #include "bitpack.h" diff --git a/bitpackv32_.h b/bitpackv32_.h index 702775e..34bbc34 100644 --- a/bitpackv32_.h +++ b/bitpackv32_.h @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// bitpackv32.h - "Integer Compression" simd bit packing +// "Integer Compression" simd bit packing #define BITBLKV32_1(ip, i, op, parm) { __m128i ov,iv;\ VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 1));\ diff --git 
a/bitunpack.c b/bitunpack.c index 0fe8f5d..c7dd63a 100644 --- a/bitunpack.c +++ b/bitunpack.c @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// bitunpack_.h - "Integer Compression" Bit Packing +// "Integer Compression" Bit Packing #ifndef BPI #include "conf.h" diff --git a/bitunpack.h b/bitunpack.h index 01773be..a2e75ab 100644 --- a/bitunpack.h +++ b/bitunpack.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// bitunpack.h - "Integer Compression" Binary Packing +// "Integer Compression" Bit Packing #ifdef __cplusplus extern "C" { @@ -37,24 +37,17 @@ unsigned char *bitunpack32(const unsigned char *__restrict in, unsigned n, unsig unsigned char *bitunpack64(const unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b); // ---------------- Direct Access to a single packed integer array entry -------------------------------------------------------------------- - #ifdef __AVX2__ -#include - #else -#define _bzhi_u64(__u, __b) ((__u) & ((1ull<<__b)-1)) -#define _bzhi_u32(__u, __b) ((__u) & ((1u <<__b)-1)) - #endif - // Get a single 32 bits value with index "idx" (or bit index b*idx) from packed integer array -static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return _bzhi_u64( (*(unsigned long long *)((unsigned *)in+(bidx>>5))) >> (bidx&0x1f), b ); } -static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, unsigned b, unsigned bidx) { return _bzhi_u64( (*(unsigned long long *)((unsigned *)in+(bidx>>5))) >> (bidx&0x1f), b ); } - +static ALWAYS_INLINE unsigned bitgetx32(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return 
bzhi64( ctou64((unsigned *)in+(bidx>>5)) >> (bidx&0x1f), b ); } +static ALWAYS_INLINE unsigned _bitgetx32(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi64( ctou64((unsigned *)in+(bidx>>5)) >> (bidx&0x1f), b ); } + // like bitgetx32 but for 16 bits integer array -static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return _bzhi_u32( (*(unsigned *)((unsigned *)in+(bidx>>4))) >> (bidx& 0xf), b ); } -static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned b, unsigned bidx) { return _bzhi_u32( (*(unsigned *)((unsigned *)in+(bidx>>4))) >> (bidx& 0xf), b ); } +static ALWAYS_INLINE unsigned bitgetx16(const unsigned char *__restrict in, unsigned idx, unsigned b) { unsigned bidx = b*idx; return bzhi32( ctou32((unsigned *)in+(bidx>>4)) >> (bidx& 0xf), b ); } +static ALWAYS_INLINE unsigned _bitgetx16(const unsigned char *__restrict in, unsigned bidx, unsigned b) { return bzhi32( ctou32((unsigned *)in+(bidx>>4)) >> (bidx& 0xf), b ); } // Set a single value with index "idx" -static ALWAYS_INLINE void bitsetx16(const unsigned char *__restrict in, unsigned b, unsigned idx, unsigned v) { unsigned bidx = b*idx; unsigned *p = (unsigned *) in+(bidx>>4) ; *p = ( *p & ~(((1u <>5)); *p = ( *p & ~(((1ull<>4) ; *p = ( *p & ~(((1u <>5)); *p = ( *p & ~(((1ull< @@ -36,9 +36,9 @@ //----------------------------------------------------------------------------- #define VSTO( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov) #define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_) -#include "bitunpackv.c" +#include "bitunpack128v.c" -#define BITUNBLKV32_0(ip, _i_, _op_, _parm_) {__m128i ov;\ +#define BITUNBLK128V32_0(ip, _i_, _op_, _parm_) {__m128i ov;\ VSTO0(_op_, 0, ov, _parm_);\ VSTO0(_op_, 1, ov, _parm_);\ VSTO0(_op_, 2, ov, _parm_);\ @@ -74,11 +74,11 @@ } #define BITUNPACK0(_parm_) _parm_ = _mm_setzero_si128() -unsigned char *bitunpackv32( const 
unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) { - const unsigned char *ip = in+PAD8(n*b); +unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); __m128i sv; - BITUNPACKV32(in, n, b, out, sv); - return (unsigned char *)ip; + BITUNPACK128V32(in, b, out, sv); + return (unsigned char *)ip; } #undef VSTO #undef VSTO0 @@ -110,12 +110,12 @@ static ALIGNED(char, shuffles[16][16], 16) = { #define VSTO( _op_, _i_, _ov_, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m) #define VSTO0(_op_, _i_, ov, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) #define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128() -#include "bitunpackv.c" +#include "bitunpack128v.c" -unsigned char *_bitunpackv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(n*b); unsigned m; +unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv; - BITUNPACKV32(in, n, b, out, sv); + BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } #undef VSTO @@ -125,30 +125,30 @@ unsigned char *_bitunpackv32( const unsigned char *__restrict in, unsigned n, un //----------------------------------------------------------------------------- #define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_) -#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG128_32(__ov); 
SCAN128_32(__ov,__sv); _mm_storeu_si128(__op++, __sv) -#include "bitunpackv.c" +#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG128x32(__ov); SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv) +#include "bitunpack128v.c" #define BITUNPACK0(_parm_) -unsigned char *bitzunpackv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(n*b); +unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); - BITUNPACKV32(in, n, b, out, sv); + BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } #undef VSTO #undef BITUNPACK0 //----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCAN128_32(__ov,__sv); _mm_storeu_si128(__op++, __sv) -#include "bitunpackv.c" +#define VSTO(__op, i, __ov, __sv) SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv) +#include "bitunpack128v.c" #define BITUNPACK0(_parm_) -unsigned char *bitdunpackv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(n*b); +unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start); - BITUNPACKV32(in, n, b, out, sv); + BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } #undef VSTO @@ -158,19 +158,19 @@ unsigned char *bitdunpackv32( const unsigned char *__restrict in, unsigned n, un //----------------------------------------------------------------------------- #ifdef __SSSE3__ #define VEXP(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), 
_mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN128_32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); #define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m) -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCAN128_32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); -#include "bitunpackv.c" +#include "bitunpack128v.c" #define BITUNPACK0(_parm_) -unsigned char *_bitdunpackv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(n*b); unsigned m; +unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start); - BITUNPACKV32(in, n, b, out, sv); + BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } #undef VSTO @@ -178,16 +178,16 @@ unsigned char *_bitdunpackv32( const unsigned char *__restrict in, unsigned n, u #undef BITUNPACK0 #endif //----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCANI128_32(__ov,__sv,cv); _mm_storeu_si128(__op++, __sv); +#define VSTO(__op, i, __ov, __sv) SCANI128x32(__ov,__sv,cv); _mm_storeu_si128(__op++, __sv); #define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv) -#include "bitunpackv.c" +#include "bitunpack128v.c" #define BITUNPACK0(_parm_) _parm_ = 
_mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4) -unsigned char *bitd1unpackv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(n*b); +unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(128*b); __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); - BITUNPACKV32(in, n, b, out, sv); + BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } #undef VSTO @@ -196,19 +196,19 @@ unsigned char *bitd1unpackv32( const unsigned char *__restrict in, unsigned n, u //----------------------------------------------------------------------------- #ifdef __SSSE3__ #define VEXP(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI128_32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); #define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m) -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCANI128_32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); -#include "bitunpackv.c" +#include "bitunpack128v.c" #define BITUNPACK0(_parm_) mv = _mm_set1_epi32(0) //_parm_ = _mm_setzero_si128() -unsigned char *_bitd1unpackv32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const 
unsigned char *ip = in+PAD8(n*b); unsigned m; +unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(128*b); unsigned m; __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); - BITUNPACKV32(in, n, b, out, sv); + BITUNPACK128V32(in, b, out, sv); return (unsigned char *)ip; } #undef VSTO @@ -217,44 +217,46 @@ unsigned char *_bitd1unpackv32( const unsigned char *__restrict in, unsigned n, #endif #else -#include "bitunpackv32_.h" +#include "bitunpack128v_.h" -#define BITUNPACKV32(__ip, __n, __nbits, __op, _parm_) { __m128i mv,*_ov=(__m128i *)__op,*_iv=(__m128i *)__ip; \ +#define BITUNPACK128V32(__ip, __nbits, __op, _parm_) { __m128i mv,*_ov=(__m128i *)__op,*_iv=(__m128i *)__ip; \ switch(__nbits&0x3f) {\ - case 0: BITUNPACK0(_parm_); BITUNPACKV32_0( _iv, _ov, _parm_); break;\ - case 1: mv = _mm_set1_epi32((1u<< 1)-1); BITUNPACKV32_1( _iv, _ov, _parm_); break;\ - case 2: mv = _mm_set1_epi32((1u<< 2)-1); BITUNPACKV32_2( _iv, _ov, _parm_); break;\ - case 3: mv = _mm_set1_epi32((1u<< 3)-1); BITUNPACKV32_3( _iv, _ov, _parm_); break;\ - case 4: mv = _mm_set1_epi32((1u<< 4)-1); BITUNPACKV32_4( _iv, _ov, _parm_); break;\ - case 5: mv = _mm_set1_epi32((1u<< 5)-1); BITUNPACKV32_5( _iv, _ov, _parm_); break;\ - case 6: mv = _mm_set1_epi32((1u<< 6)-1); BITUNPACKV32_6( _iv, _ov, _parm_); break;\ - case 7: mv = _mm_set1_epi32((1u<< 7)-1); BITUNPACKV32_7( _iv, _ov, _parm_); break;\ - case 8: mv = _mm_set1_epi32((1u<< 8)-1); BITUNPACKV32_8( _iv, _ov, _parm_); break;\ - case 9: mv = _mm_set1_epi32((1u<< 9)-1); BITUNPACKV32_9( _iv, _ov, _parm_); break;\ - case 10: mv = _mm_set1_epi32((1u<<10)-1); BITUNPACKV32_10(_iv, _ov, _parm_); break;\ - case 11: mv = _mm_set1_epi32((1u<<11)-1); BITUNPACKV32_11(_iv, _ov, _parm_); break;\ - case 12: mv = _mm_set1_epi32((1u<<12)-1); BITUNPACKV32_12(_iv, _ov, _parm_); break;\ - case 13: mv = 
_mm_set1_epi32((1u<<13)-1); BITUNPACKV32_13(_iv, _ov, _parm_); break;\ - case 14: mv = _mm_set1_epi32((1u<<14)-1); BITUNPACKV32_14(_iv, _ov, _parm_); break;\ - case 15: mv = _mm_set1_epi32((1u<<15)-1); BITUNPACKV32_15(_iv, _ov, _parm_); break;\ - case 16: mv = _mm_set1_epi32((1u<<16)-1); BITUNPACKV32_16(_iv, _ov, _parm_); break;\ - case 17: mv = _mm_set1_epi32((1u<<17)-1); BITUNPACKV32_17(_iv, _ov, _parm_); break;\ - case 18: mv = _mm_set1_epi32((1u<<18)-1); BITUNPACKV32_18(_iv, _ov, _parm_); break;\ - case 19: mv = _mm_set1_epi32((1u<<19)-1); BITUNPACKV32_19(_iv, _ov, _parm_); break;\ - case 20: mv = _mm_set1_epi32((1u<<20)-1); BITUNPACKV32_20(_iv, _ov, _parm_); break;\ - case 21: mv = _mm_set1_epi32((1u<<21)-1); BITUNPACKV32_21(_iv, _ov, _parm_); break;\ - case 22: mv = _mm_set1_epi32((1u<<22)-1); BITUNPACKV32_22(_iv, _ov, _parm_); break;\ - case 23: mv = _mm_set1_epi32((1u<<23)-1); BITUNPACKV32_23(_iv, _ov, _parm_); break;\ - case 24: mv = _mm_set1_epi32((1u<<24)-1); BITUNPACKV32_24(_iv, _ov, _parm_); break;\ - case 25: mv = _mm_set1_epi32((1u<<25)-1); BITUNPACKV32_25(_iv, _ov, _parm_); break;\ - case 26: mv = _mm_set1_epi32((1u<<26)-1); BITUNPACKV32_26(_iv, _ov, _parm_); break;\ - case 27: mv = _mm_set1_epi32((1u<<27)-1); BITUNPACKV32_27(_iv, _ov, _parm_); break;\ - case 28: mv = _mm_set1_epi32((1u<<28)-1); BITUNPACKV32_28(_iv, _ov, _parm_); break;\ - case 29: mv = _mm_set1_epi32((1u<<29)-1); BITUNPACKV32_29(_iv, _ov, _parm_); break;\ - case 30: mv = _mm_set1_epi32((1u<<30)-1); BITUNPACKV32_30(_iv, _ov, _parm_); break;\ - case 31: mv = _mm_set1_epi32((1u<<31)-1); BITUNPACKV32_31(_iv, _ov, _parm_); break;\ - case 32: mv = _mm_set1_epi32((1ull<<32)-1);BITUNPACKV32_32(_iv, _ov, _parm_); break;\ + case 0: BITUNPACK0(_parm_); BITUNPACK128V32_0( _iv, _ov, _parm_); break;\ + case 1: mv = _mm_set1_epi32((1u<< 1)-1); BITUNPACK128V32_1( _iv, _ov, _parm_); break;\ + case 2: mv = _mm_set1_epi32((1u<< 2)-1); BITUNPACK128V32_2( _iv, _ov, _parm_); break;\ + case 3: mv = 
_mm_set1_epi32((1u<< 3)-1); BITUNPACK128V32_3( _iv, _ov, _parm_); break;\ + case 4: mv = _mm_set1_epi32((1u<< 4)-1); BITUNPACK128V32_4( _iv, _ov, _parm_); break;\ + case 5: mv = _mm_set1_epi32((1u<< 5)-1); BITUNPACK128V32_5( _iv, _ov, _parm_); break;\ + case 6: mv = _mm_set1_epi32((1u<< 6)-1); BITUNPACK128V32_6( _iv, _ov, _parm_); break;\ + case 7: mv = _mm_set1_epi32((1u<< 7)-1); BITUNPACK128V32_7( _iv, _ov, _parm_); break;\ + case 8: mv = _mm_set1_epi32((1u<< 8)-1); BITUNPACK128V32_8( _iv, _ov, _parm_); break;\ + case 9: mv = _mm_set1_epi32((1u<< 9)-1); BITUNPACK128V32_9( _iv, _ov, _parm_); break;\ + case 10: mv = _mm_set1_epi32((1u<<10)-1); BITUNPACK128V32_10(_iv, _ov, _parm_); break;\ + case 11: mv = _mm_set1_epi32((1u<<11)-1); BITUNPACK128V32_11(_iv, _ov, _parm_); break;\ + case 12: mv = _mm_set1_epi32((1u<<12)-1); BITUNPACK128V32_12(_iv, _ov, _parm_); break;\ + case 13: mv = _mm_set1_epi32((1u<<13)-1); BITUNPACK128V32_13(_iv, _ov, _parm_); break;\ + case 14: mv = _mm_set1_epi32((1u<<14)-1); BITUNPACK128V32_14(_iv, _ov, _parm_); break;\ + case 15: mv = _mm_set1_epi32((1u<<15)-1); BITUNPACK128V32_15(_iv, _ov, _parm_); break;\ + case 16: mv = _mm_set1_epi32((1u<<16)-1); BITUNPACK128V32_16(_iv, _ov, _parm_); break;\ + case 17: mv = _mm_set1_epi32((1u<<17)-1); BITUNPACK128V32_17(_iv, _ov, _parm_); break;\ + case 18: mv = _mm_set1_epi32((1u<<18)-1); BITUNPACK128V32_18(_iv, _ov, _parm_); break;\ + case 19: mv = _mm_set1_epi32((1u<<19)-1); BITUNPACK128V32_19(_iv, _ov, _parm_); break;\ + case 20: mv = _mm_set1_epi32((1u<<20)-1); BITUNPACK128V32_20(_iv, _ov, _parm_); break;\ + case 21: mv = _mm_set1_epi32((1u<<21)-1); BITUNPACK128V32_21(_iv, _ov, _parm_); break;\ + case 22: mv = _mm_set1_epi32((1u<<22)-1); BITUNPACK128V32_22(_iv, _ov, _parm_); break;\ + case 23: mv = _mm_set1_epi32((1u<<23)-1); BITUNPACK128V32_23(_iv, _ov, _parm_); break;\ + case 24: mv = _mm_set1_epi32((1u<<24)-1); BITUNPACK128V32_24(_iv, _ov, _parm_); break;\ + case 25: mv = 
_mm_set1_epi32((1u<<25)-1); BITUNPACK128V32_25(_iv, _ov, _parm_); break;\ + case 26: mv = _mm_set1_epi32((1u<<26)-1); BITUNPACK128V32_26(_iv, _ov, _parm_); break;\ + case 27: mv = _mm_set1_epi32((1u<<27)-1); BITUNPACK128V32_27(_iv, _ov, _parm_); break;\ + case 28: mv = _mm_set1_epi32((1u<<28)-1); BITUNPACK128V32_28(_iv, _ov, _parm_); break;\ + case 29: mv = _mm_set1_epi32((1u<<29)-1); BITUNPACK128V32_29(_iv, _ov, _parm_); break;\ + case 30: mv = _mm_set1_epi32((1u<<30)-1); BITUNPACK128V32_30(_iv, _ov, _parm_); break;\ + case 31: mv = _mm_set1_epi32((1u<<31)-1); BITUNPACK128V32_31(_iv, _ov, _parm_); break;\ + case 32: mv = _mm_set1_epi32((1ull<<32)-1);BITUNPACK128V32_32(_iv, _ov, _parm_); break;\ case 33 ... 63: break;\ }\ -} +} #endif + + diff --git a/bitunpack128v_.h b/bitunpack128v_.h new file mode 100644 index 0000000..9c66475 --- /dev/null +++ b/bitunpack128v_.h @@ -0,0 +1,2002 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// TurboPFor: Integer Compression SIMD bit unpacking +#define BITUNPACK128V32_0(ip, op, parm) {\ + BITUNBLK128V32_0(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_1(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); 
VSTO(op,i*32+20,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 27),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 29),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 30),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_1(ip, op, parm) {\ + BITUNBLK128V32_1(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_2(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); 
VSTO(op,i*16+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_2(ip, op, parm) {\ + BITUNBLK128V32_2(ip, 0, op, parm);\ + BITUNBLK128V32_2(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_3(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 27),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+18,ov,parm); \ + ov = 
_mm_and_si128(_mm_srli_epi32(iv, 25),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_3(ip, op, parm) {\ + BITUNBLK128V32_3(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_4(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*8+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*8+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*8+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACK128V32_4(ip, op, parm) {\ + BITUNBLK128V32_4(ip, 0, op, parm);\ + BITUNBLK128V32_4(ip, 1, op, parm);\ + BITUNBLK128V32_4(ip, 2, op, parm);\ + 
BITUNBLK128V32_4(ip, 3, op, parm);\ +} + +#define BITUNBLK128V32_5(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ + ov = 
_mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_5(ip, op, parm) {\ + BITUNBLK128V32_5(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_6(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); 
ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_6(ip, op, parm) {\ + BITUNBLK128V32_6(ip, 0, op, parm);\ + BITUNBLK128V32_6(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_7(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ + 
ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_7(ip, op, parm) {\ + BITUNBLK128V32_7(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_8(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*4+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*4+ 
2,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); VSTO(op,i*4+ 3,ov,parm); ;\ +} + +#define BITUNPACK128V32_8(ip, op, parm) {\ + BITUNBLK128V32_8(ip, 0, op, parm);\ + BITUNBLK128V32_8(ip, 1, op, parm);\ + BITUNBLK128V32_8(ip, 2, op, parm);\ + BITUNBLK128V32_8(ip, 3, op, parm);\ + BITUNBLK128V32_8(ip, 4, op, parm);\ + BITUNBLK128V32_8(ip, 5, op, parm);\ + BITUNBLK128V32_8(ip, 6, op, parm);\ + BITUNBLK128V32_8(ip, 7, op, parm);\ +} + +#define BITUNBLK128V32_9(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ 
+ ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_9(ip, op, parm) {\ + BITUNBLK128V32_9(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_10(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 
20),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_10(ip, op, parm) {\ + BITUNBLK128V32_10(ip, 0, op, parm);\ + BITUNBLK128V32_10(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_11(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = 
_mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_11(ip, op, parm) {\ + BITUNBLK128V32_11(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_12(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*8+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACK128V32_12(ip, op, parm) {\ + BITUNBLK128V32_12(ip, 0, op, parm);\ + BITUNBLK128V32_12(ip, 1, op, parm);\ + BITUNBLK128V32_12(ip, 2, op, parm);\ + 
BITUNBLK128V32_12(ip, 3, op, parm);\ +} + +#define BITUNBLK128V32_13(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_13(ip, op, parm) {\ + BITUNBLK128V32_13(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_14(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_14(ip, op, parm) {\ + BITUNBLK128V32_14(ip, 0, op, parm);\ + BITUNBLK128V32_14(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_15(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); 
VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); 
VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 17); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_15(ip, op, parm) {\ + BITUNBLK128V32_15(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_16(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*2+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 16); VSTO(op,i*2+ 1,ov,parm); ;\ +} + +#define BITUNPACK128V32_16(ip, op, parm) {\ + BITUNBLK128V32_16(ip, 0, op, parm);\ + BITUNBLK128V32_16(ip, 1, op, parm);\ + BITUNBLK128V32_16(ip, 2, op, parm);\ + BITUNBLK128V32_16(ip, 3, op, parm);\ + BITUNBLK128V32_16(ip, 4, op, parm);\ + BITUNBLK128V32_16(ip, 5, op, parm);\ + BITUNBLK128V32_16(ip, 6, op, parm);\ + 
BITUNBLK128V32_16(ip, 7, op, parm);\ + BITUNBLK128V32_16(ip, 8, op, parm);\ + BITUNBLK128V32_16(ip, 9, op, parm);\ + BITUNBLK128V32_16(ip, 10, op, parm);\ + BITUNBLK128V32_16(ip, 11, op, parm);\ + BITUNBLK128V32_16(ip, 12, op, parm);\ + BITUNBLK128V32_16(ip, 13, op, parm);\ + BITUNBLK128V32_16(ip, 14, op, parm);\ + BITUNBLK128V32_16(ip, 15, op, parm);\ +} + +#define BITUNBLK128V32_17(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 
13),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_17(ip, op, parm) {\ + BITUNBLK128V32_17(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_18(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), 
mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_18(ip, op, parm) {\ + BITUNBLK128V32_18(ip, 0, op, parm);\ + BITUNBLK128V32_18(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_19(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = 
_mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_19(ip, op, parm) {\ + BITUNBLK128V32_19(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_20(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACK128V32_20(ip, op, parm) {\ + BITUNBLK128V32_20(ip, 0, op, parm);\ + BITUNBLK128V32_20(ip, 1, op, parm);\ + BITUNBLK128V32_20(ip, 2, op, parm);\ + BITUNBLK128V32_20(ip, 3, op, parm);\ +} + +#define BITUNBLK128V32_21(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); 
\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); 
VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), 
mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_21(ip, op, parm) {\ + BITUNBLK128V32_21(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_22(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_22(ip, op, parm) {\ + BITUNBLK128V32_22(ip, 0, op, parm);\ + BITUNBLK128V32_22(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_23(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); 
VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_23(ip, op, parm) {\ + BITUNBLK128V32_23(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_24(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); VSTO(op,i*4+ 3,ov,parm); ;\ +} + +#define BITUNPACK128V32_24(ip, op, parm) {\ + BITUNBLK128V32_24(ip, 0, op, parm);\ + BITUNBLK128V32_24(ip, 1, op, parm);\ + BITUNBLK128V32_24(ip, 2, op, parm);\ + BITUNBLK128V32_24(ip, 3, op, parm);\ + BITUNBLK128V32_24(ip, 4, op, parm);\ + BITUNBLK128V32_24(ip, 5, op, parm);\ + BITUNBLK128V32_24(ip, 6, op, parm);\ + BITUNBLK128V32_24(ip, 7, op, parm);\ +} + +#define BITUNBLK128V32_25(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i 
*)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_25(ip, op, parm) {\ + BITUNBLK128V32_25(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_26(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_26(ip, op, parm) {\ + BITUNBLK128V32_26(ip, 0, op, parm);\ + BITUNBLK128V32_26(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_27(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); 
VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_27(ip, op, parm) {\ + BITUNBLK128V32_27(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_28(ip, i, op, 
parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACK128V32_28(ip, op, parm) {\ + BITUNBLK128V32_28(ip, 0, op, parm);\ + BITUNBLK128V32_28(ip, 1, op, parm);\ + BITUNBLK128V32_28(ip, 2, op, parm);\ + BITUNBLK128V32_28(ip, 3, op, parm);\ +} + +#define BITUNBLK128V32_29(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ + 
ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); 
VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_29(ip, op, parm) {\ + BITUNBLK128V32_29(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_30(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 
9,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK128V32_30(ip, op, parm) {\ + BITUNBLK128V32_30(ip, 0, op, parm);\ + BITUNBLK128V32_30(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_31(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK128V32_31(ip, op, parm) {\ + BITUNBLK128V32_31(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_32(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv ,mv); VSTO(op,i*1+ 0,ov,parm); ;\ +} + +#define BITUNPACK128V32_32(ip, op, parm) {\ + BITUNBLK128V32_32(ip, 0, op, parm);\ + BITUNBLK128V32_32(ip, 1, op, parm);\ + BITUNBLK128V32_32(ip, 2, op, parm);\ + BITUNBLK128V32_32(ip, 3, op, parm);\ + BITUNBLK128V32_32(ip, 4, op, parm);\ + BITUNBLK128V32_32(ip, 5, op, parm);\ + BITUNBLK128V32_32(ip, 6, op, parm);\ + BITUNBLK128V32_32(ip, 7, op, parm);\ + BITUNBLK128V32_32(ip, 8, op, parm);\ + BITUNBLK128V32_32(ip, 9, op, parm);\ + BITUNBLK128V32_32(ip, 10, op, parm);\ + BITUNBLK128V32_32(ip, 11, op, parm);\ + BITUNBLK128V32_32(ip, 12, op, parm);\ + BITUNBLK128V32_32(ip, 13, op, parm);\ + BITUNBLK128V32_32(ip, 14, op, parm);\ + BITUNBLK128V32_32(ip, 15, op, parm);\ + BITUNBLK128V32_32(ip, 16, op, parm);\ + BITUNBLK128V32_32(ip, 17, op, parm);\ + BITUNBLK128V32_32(ip, 18, op, parm);\ + BITUNBLK128V32_32(ip, 19, op, parm);\ + BITUNBLK128V32_32(ip, 20, op, parm);\ + BITUNBLK128V32_32(ip, 21, op, parm);\ + BITUNBLK128V32_32(ip, 22, op, parm);\ + BITUNBLK128V32_32(ip, 23, op, parm);\ + BITUNBLK128V32_32(ip, 24, op, parm);\ + BITUNBLK128V32_32(ip, 25, op, parm);\ + BITUNBLK128V32_32(ip, 26, op, parm);\ + BITUNBLK128V32_32(ip, 27, op, parm);\ + BITUNBLK128V32_32(ip, 28, op, parm);\ + BITUNBLK128V32_32(ip, 29, op, parm);\ + BITUNBLK128V32_32(ip, 30, op, parm);\ + BITUNBLK128V32_32(ip, 31, op, parm);\ +} + +#define BITUNBLK128V32_33(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_33(ip, op, parm) {\ + BITUNBLK128V32_33(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_34(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK128V32_34(ip, op, parm) {\ + BITUNBLK128V32_34(ip, 0, op, parm);\ + BITUNBLK128V32_34(ip, 1, op, parm);\ +} + 
+#define BITUNBLK128V32_35(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); 
VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_35(ip, op, parm) {\ + BITUNBLK128V32_35(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_36(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 
2,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 7,ov,parm);;\ +} + +#define BITUNPACK128V32_36(ip, op, parm) {\ + BITUNBLK128V32_36(ip, 0, op, parm);\ + BITUNBLK128V32_36(ip, 1, op, parm);\ + BITUNBLK128V32_36(ip, 2, op, parm);\ + BITUNBLK128V32_36(ip, 3, op, parm);\ +} + +#define BITUNBLK128V32_37(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 
15), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_37(ip, op, parm) {\ + BITUNBLK128V32_37(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_38(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); 
VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK128V32_38(ip, op, parm) {\ + BITUNBLK128V32_38(ip, 0, op, parm);\ + BITUNBLK128V32_38(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_39(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); 
VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_39(ip, op, parm) {\ + BITUNBLK128V32_39(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_40(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 3,ov,parm);;\ +} + +#define BITUNPACK128V32_40(ip, op, parm) {\ + BITUNBLK128V32_40(ip, 0, op, parm);\ + BITUNBLK128V32_40(ip, 1, op, parm);\ + BITUNBLK128V32_40(ip, 2, op, parm);\ + BITUNBLK128V32_40(ip, 3, op, parm);\ + BITUNBLK128V32_40(ip, 4, op, parm);\ + BITUNBLK128V32_40(ip, 5, op, parm);\ + BITUNBLK128V32_40(ip, 6, op, parm);\ + BITUNBLK128V32_40(ip, 7, op, parm);\ +} + +#define BITUNBLK128V32_41(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 
8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_41(ip, op, parm) {\ + BITUNBLK128V32_41(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_42(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); 
VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define 
BITUNPACK128V32_42(ip, op, parm) {\ + BITUNBLK128V32_42(ip, 0, op, parm);\ + BITUNBLK128V32_42(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_43(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); 
VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_43(ip, op, parm) {\ + BITUNBLK128V32_43(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_44(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 
24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 7,ov,parm);;\ +} + +#define BITUNPACK128V32_44(ip, op, parm) {\ + BITUNBLK128V32_44(ip, 0, op, parm);\ + BITUNBLK128V32_44(ip, 1, op, parm);\ + BITUNBLK128V32_44(ip, 2, op, parm);\ + BITUNBLK128V32_44(ip, 3, op, parm);\ +} + +#define BITUNBLK128V32_45(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); 
VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_45(ip, op, parm) {\ + BITUNBLK128V32_45(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_46(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 
16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +/* BITUNPACK128V32_46: expands BITUNBLK128V32_46 twice, first with block index 0 and then with block index 1. */ #define BITUNPACK128V32_46(ip, op, parm) {\ + BITUNBLK128V32_46(ip, 0, op, parm);\ + BITUNBLK128V32_46(ip, 1, op, parm);\ +} + +/* BITUNBLK128V32_47: one unrolled block of 32 steps for block index i. An initial vector is loaded from ip; each step then right-shifts the 32-bit lanes of the current vector, loads the next vector (ip++ is evaluated on every load, so ip must be a modifiable pointer), ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO with index i*32+step. mv and VSTO are not defined by this macro and must be provided at the expansion site. NOTE(review): step 0 shifts left by 32; Intel documents that _mm_slli_epi32 counts above 31 produce zero, so that term adds nothing - confirm this is intended. NOTE(review): the _47 suffix is presumably the packed bit width - verify against the generator. */ #define BITUNBLK128V32_47(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +/* BITUNPACK128V32_47: expands BITUNBLK128V32_47 once, with block index 0. */ #define BITUNPACK128V32_47(ip, op, parm) {\ + BITUNBLK128V32_47(ip, 0, op, parm);\ +} + +/* BITUNBLK128V32_48: unrolled block of 2 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*2+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_48(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*2+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*2+ 1,ov,parm);;\ +} + +/* BITUNPACK128V32_48: expands BITUNBLK128V32_48 sixteen times, with block indices 0 through 15 in order. */ #define BITUNPACK128V32_48(ip, op, parm) {\ + BITUNBLK128V32_48(ip, 0, op, parm);\ + BITUNBLK128V32_48(ip, 1, op, parm);\ + BITUNBLK128V32_48(ip, 2, op, parm);\ + BITUNBLK128V32_48(ip, 3, op, parm);\ + BITUNBLK128V32_48(ip, 4, op, parm);\ + BITUNBLK128V32_48(ip, 5, op, parm);\ + BITUNBLK128V32_48(ip, 6, op, parm);\ + BITUNBLK128V32_48(ip, 7, op, parm);\ + BITUNBLK128V32_48(ip, 8, op, parm);\ + BITUNBLK128V32_48(ip, 9, op, parm);\ + BITUNBLK128V32_48(ip, 10, op, parm);\ + BITUNBLK128V32_48(ip, 11, op, parm);\ + BITUNBLK128V32_48(ip, 12, op, parm);\ + BITUNBLK128V32_48(ip, 13, op, parm);\ + BITUNBLK128V32_48(ip, 14, op, parm);\ + BITUNBLK128V32_48(ip, 15, op, parm);\ +} + +/* BITUNBLK128V32_49: unrolled block of 32 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*32+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_49(ip, i, op, parm) { __m128i ov,iv 
= _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +/* BITUNPACK128V32_49: expands BITUNBLK128V32_49 once, with block index 0. */ #define BITUNPACK128V32_49(ip, op, parm) {\ + BITUNBLK128V32_49(ip, 0, op, parm);\ +} + +/* BITUNBLK128V32_50: unrolled block of 16 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*16+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_50(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +/* BITUNPACK128V32_50: expands BITUNBLK128V32_50 twice, first with block index 0 and then with block index 1. */ #define BITUNPACK128V32_50(ip, op, parm) {\ + BITUNBLK128V32_50(ip, 0, op, parm);\ + BITUNBLK128V32_50(ip, 1, op, parm);\ +} + +/* BITUNBLK128V32_51: unrolled block of 32 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*32+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_51(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +/* BITUNPACK128V32_51: expands BITUNBLK128V32_51 once, with block index 0. */ #define BITUNPACK128V32_51(ip, op, parm) {\ + BITUNBLK128V32_51(ip, 0, op, parm);\ +} + +/* BITUNBLK128V32_52: unrolled block of 8 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*8+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_52(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 7,ov,parm);;\ +} + +/* BITUNPACK128V32_52: expands BITUNBLK128V32_52 four times, with block indices 0 through 3 in order. */ #define BITUNPACK128V32_52(ip, op, parm) {\ + BITUNBLK128V32_52(ip, 0, op, parm);\ + BITUNBLK128V32_52(ip, 1, op, parm);\ + BITUNBLK128V32_52(ip, 2, op, parm);\ + BITUNBLK128V32_52(ip, 3, op, parm);\ +} + +/* BITUNBLK128V32_53: unrolled block of 32 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*32+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_53(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ + 
ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +/* BITUNPACK128V32_53: expands BITUNBLK128V32_53 once, with block index 0. */ #define BITUNPACK128V32_53(ip, op, parm) {\ + BITUNBLK128V32_53(ip, 0, op, parm);\ +} + +/* BITUNBLK128V32_54: unrolled block of 16 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*16+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_54(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +/* BITUNPACK128V32_54: expands BITUNBLK128V32_54 twice, first with block index 0 and then with block index 1. */ #define BITUNPACK128V32_54(ip, op, parm) {\ + BITUNBLK128V32_54(ip, 0, op, parm);\ + BITUNBLK128V32_54(ip, 1, op, parm);\ +} + +/* BITUNBLK128V32_55: unrolled block of 32 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*32+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_55(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 
30), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +/* BITUNPACK128V32_55: expands BITUNBLK128V32_55 once, with block index 0. */ #define BITUNPACK128V32_55(ip, op, parm) {\ + BITUNBLK128V32_55(ip, 0, op, parm);\ +} + +/* BITUNBLK128V32_56: unrolled block of 4 steps for block index i. After an initial load from ip, each step right-shifts the current vector, loads the next one via ip++, ORs in the new vector left-shifted and ANDed with mv, and passes the result to VSTO at index i*4+step. mv and VSTO must be provided at the expansion site. NOTE(review): step 0 uses _mm_slli_epi32(iv, 32), which Intel documents as producing zero - confirm intent. */ #define BITUNBLK128V32_56(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 3,ov,parm);;\ +} + +/* BITUNPACK128V32_56: expands BITUNBLK128V32_56 eight times, with block indices 0 through 7 in order. */ #define BITUNPACK128V32_56(ip, op, parm) {\ + BITUNBLK128V32_56(ip, 0, op, parm);\ + BITUNBLK128V32_56(ip, 1, op, parm);\ + BITUNBLK128V32_56(ip, 2, op, parm);\ + BITUNBLK128V32_56(ip, 3, op, parm);\ + BITUNBLK128V32_56(ip, 4, op, parm);\ + BITUNBLK128V32_56(ip, 5, op, parm);\ + BITUNBLK128V32_56(ip, 6, op, parm);\ + BITUNBLK128V32_56(ip, 7, op, 
parm);\ +} + +#define BITUNBLK128V32_57(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); 
VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_57(ip, op, parm) {\ + BITUNBLK128V32_57(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_58(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 
2,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK128V32_58(ip, op, parm) {\ + BITUNBLK128V32_58(ip, 0, op, parm);\ + BITUNBLK128V32_58(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_59(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_59(ip, op, parm) {\ + BITUNBLK128V32_59(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_60(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 7,ov,parm);;\ +} + +#define BITUNPACK128V32_60(ip, op, parm) {\ + BITUNBLK128V32_60(ip, 0, op, parm);\ + BITUNBLK128V32_60(ip, 1, op, parm);\ + BITUNBLK128V32_60(ip, 2, op, parm);\ + BITUNBLK128V32_60(ip, 3, op, parm);\ +} + +#define BITUNBLK128V32_61(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); 
VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_61(ip, op, parm) {\ + BITUNBLK128V32_61(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_62(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 
18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK128V32_62(ip, op, parm) {\ + BITUNBLK128V32_62(ip, 0, op, parm);\ + BITUNBLK128V32_62(ip, 1, op, parm);\ +} + +#define BITUNBLK128V32_63(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = 
_mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK128V32_63(ip, op, parm) {\ + BITUNBLK128V32_63(ip, 0, op, parm);\ +} + +#define BITUNBLK128V32_64(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*1+ 0,ov,parm);;\ +} + +#define BITUNPACK128V32_64(ip, op, parm) {\ + BITUNBLK128V32_64(ip, 0, op, parm);\ + BITUNBLK128V32_64(ip, 1, op, parm);\ + BITUNBLK128V32_64(ip, 2, op, parm);\ + BITUNBLK128V32_64(ip, 3, op, parm);\ + BITUNBLK128V32_64(ip, 4, op, parm);\ + BITUNBLK128V32_64(ip, 5, op, parm);\ + BITUNBLK128V32_64(ip, 6, op, parm);\ + BITUNBLK128V32_64(ip, 7, op, parm);\ + BITUNBLK128V32_64(ip, 8, op, parm);\ + BITUNBLK128V32_64(ip, 9, op, parm);\ + BITUNBLK128V32_64(ip, 10, op, parm);\ + BITUNBLK128V32_64(ip, 11, op, parm);\ + BITUNBLK128V32_64(ip, 12, op, parm);\ + BITUNBLK128V32_64(ip, 13, op, parm);\ + BITUNBLK128V32_64(ip, 14, op, parm);\ + BITUNBLK128V32_64(ip, 15, op, parm);\ + BITUNBLK128V32_64(ip, 16, op, parm);\ + BITUNBLK128V32_64(ip, 17, 
op, parm);\ + BITUNBLK128V32_64(ip, 18, op, parm);\ + BITUNBLK128V32_64(ip, 19, op, parm);\ + BITUNBLK128V32_64(ip, 20, op, parm);\ + BITUNBLK128V32_64(ip, 21, op, parm);\ + BITUNBLK128V32_64(ip, 22, op, parm);\ + BITUNBLK128V32_64(ip, 23, op, parm);\ + BITUNBLK128V32_64(ip, 24, op, parm);\ + BITUNBLK128V32_64(ip, 25, op, parm);\ + BITUNBLK128V32_64(ip, 26, op, parm);\ + BITUNBLK128V32_64(ip, 27, op, parm);\ + BITUNBLK128V32_64(ip, 28, op, parm);\ + BITUNBLK128V32_64(ip, 29, op, parm);\ + BITUNBLK128V32_64(ip, 30, op, parm);\ + BITUNBLK128V32_64(ip, 31, op, parm);\ +} + diff --git a/bitunpack256v.c b/bitunpack256v.c new file mode 100644 index 0000000..f55333d --- /dev/null +++ b/bitunpack256v.c @@ -0,0 +1,500 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// "Integer Compression" SIMD Bit Packing + #ifndef VSTO +#include + +#include +#include +#include "conf.h" +#include "bitutil.h" +#include "bitunpack.h" + + #if defined(__AVX512F__) && defined(__AVX512VL__) /* the 256-bit masked intrinsics below need AVX512VL as well as AVX512F */ +#include +#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_) +#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_expandloadu_epi32(_m_,_v_) /* expand-load: callers advance pex by popcnt32(mask), so the set lanes must receive consecutive elements, as in the AVX2 fallback */ + #else +#include +static unsigned char shuffles[256][8] __attribute__((aligned(32))) = { +0,0,0,0,0,0,0,0, +0,1,1,1,1,1,1,1, +1,0,1,1,1,1,1,1, +0,1,2,2,2,2,2,2, +1,1,0,1,1,1,1,1, +0,2,1,2,2,2,2,2, +2,0,1,2,2,2,2,2, +0,1,2,3,3,3,3,3, +1,1,1,0,1,1,1,1, +0,2,2,1,2,2,2,2, +2,0,2,1,2,2,2,2, +0,1,3,2,3,3,3,3, +2,2,0,1,2,2,2,2, +0,3,1,2,3,3,3,3, +3,0,1,2,3,3,3,3, +0,1,2,3,4,4,4,4, +1,1,1,1,0,1,1,1, +0,2,2,2,1,2,2,2, +2,0,2,2,1,2,2,2, +0,1,3,3,2,3,3,3, +2,2,0,2,1,2,2,2, +0,3,1,3,2,3,3,3, +3,0,1,3,2,3,3,3, +0,1,2,4,3,4,4,4, +2,2,2,0,1,2,2,2, +0,3,3,1,2,3,3,3, +3,0,3,1,2,3,3,3, +0,1,4,2,3,4,4,4, +3,3,0,1,2,3,3,3, +0,4,1,2,3,4,4,4, +4,0,1,2,3,4,4,4, +0,1,2,3,4,5,5,5, +1,1,1,1,1,0,1,1, +0,2,2,2,2,1,2,2, +2,0,2,2,2,1,2,2, +0,1,3,3,3,2,3,3, +2,2,0,2,2,1,2,2, +0,3,1,3,3,2,3,3, +3,0,1,3,3,2,3,3, +0,1,2,4,4,3,4,4, +2,2,2,0,2,1,2,2, +0,3,3,1,3,2,3,3, +3,0,3,1,3,2,3,3, +0,1,4,2,4,3,4,4, +3,3,0,1,3,2,3,3, +0,4,1,2,4,3,4,4, +4,0,1,2,4,3,4,4, +0,1,2,3,5,4,5,5, +2,2,2,2,0,1,2,2, +0,3,3,3,1,2,3,3, +3,0,3,3,1,2,3,3, +0,1,4,4,2,3,4,4, +3,3,0,3,1,2,3,3, +0,4,1,4,2,3,4,4, +4,0,1,4,2,3,4,4, +0,1,2,5,3,4,5,5, +3,3,3,0,1,2,3,3, +0,4,4,1,2,3,4,4, +4,0,4,1,2,3,4,4, +0,1,5,2,3,4,5,5, +4,4,0,1,2,3,4,4, +0,5,1,2,3,4,5,5, +5,0,1,2,3,4,5,5, +0,1,2,3,4,5,6,6, +1,1,1,1,1,1,0,1, +0,2,2,2,2,2,1,2, +2,0,2,2,2,2,1,2, +0,1,3,3,3,3,2,3, +2,2,0,2,2,2,1,2, +0,3,1,3,3,3,2,3, +3,0,1,3,3,3,2,3, +0,1,2,4,4,4,3,4, +2,2,2,0,2,2,1,2, +0,3,3,1,3,3,2,3, +3,0,3,1,3,3,2,3, +0,1,4,2,4,4,3,4,
+3,3,0,1,3,3,2,3, +0,4,1,2,4,4,3,4, +4,0,1,2,4,4,3,4, +0,1,2,3,5,5,4,5, +2,2,2,2,0,2,1,2, +0,3,3,3,1,3,2,3, +3,0,3,3,1,3,2,3, +0,1,4,4,2,4,3,4, +3,3,0,3,1,3,2,3, +0,4,1,4,2,4,3,4, +4,0,1,4,2,4,3,4, +0,1,2,5,3,5,4,5, +3,3,3,0,1,3,2,3, +0,4,4,1,2,4,3,4, +4,0,4,1,2,4,3,4, +0,1,5,2,3,5,4,5, +4,4,0,1,2,4,3,4, +0,5,1,2,3,5,4,5, +5,0,1,2,3,5,4,5, +0,1,2,3,4,6,5,6, +2,2,2,2,2,0,1,2, +0,3,3,3,3,1,2,3, +3,0,3,3,3,1,2,3, +0,1,4,4,4,2,3,4, +3,3,0,3,3,1,2,3, +0,4,1,4,4,2,3,4, +4,0,1,4,4,2,3,4, +0,1,2,5,5,3,4,5, +3,3,3,0,3,1,2,3, +0,4,4,1,4,2,3,4, +4,0,4,1,4,2,3,4, +0,1,5,2,5,3,4,5, +4,4,0,1,4,2,3,4, +0,5,1,2,5,3,4,5, +5,0,1,2,5,3,4,5, +0,1,2,3,6,4,5,6, +3,3,3,3,0,1,2,3, +0,4,4,4,1,2,3,4, +4,0,4,4,1,2,3,4, +0,1,5,5,2,3,4,5, +4,4,0,4,1,2,3,4, +0,5,1,5,2,3,4,5, +5,0,1,5,2,3,4,5, +0,1,2,6,3,4,5,6, +4,4,4,0,1,2,3,4, +0,5,5,1,2,3,4,5, +5,0,5,1,2,3,4,5, +0,1,6,2,3,4,5,6, +5,5,0,1,2,3,4,5, +0,6,1,2,3,4,5,6, +6,0,1,2,3,4,5,6, +0,1,2,3,4,5,6,7, +1,1,1,1,1,1,1,0, +0,2,2,2,2,2,2,1, +2,0,2,2,2,2,2,1, +0,1,3,3,3,3,3,2, +2,2,0,2,2,2,2,1, +0,3,1,3,3,3,3,2, +3,0,1,3,3,3,3,2, +0,1,2,4,4,4,4,3, +2,2,2,0,2,2,2,1, +0,3,3,1,3,3,3,2, +3,0,3,1,3,3,3,2, +0,1,4,2,4,4,4,3, +3,3,0,1,3,3,3,2, +0,4,1,2,4,4,4,3, +4,0,1,2,4,4,4,3, +0,1,2,3,5,5,5,4, +2,2,2,2,0,2,2,1, +0,3,3,3,1,3,3,2, +3,0,3,3,1,3,3,2, +0,1,4,4,2,4,4,3, +3,3,0,3,1,3,3,2, +0,4,1,4,2,4,4,3, +4,0,1,4,2,4,4,3, +0,1,2,5,3,5,5,4, +3,3,3,0,1,3,3,2, +0,4,4,1,2,4,4,3, +4,0,4,1,2,4,4,3, +0,1,5,2,3,5,5,4, +4,4,0,1,2,4,4,3, +0,5,1,2,3,5,5,4, +5,0,1,2,3,5,5,4, +0,1,2,3,4,6,6,5, +2,2,2,2,2,0,2,1, +0,3,3,3,3,1,3,2, +3,0,3,3,3,1,3,2, +0,1,4,4,4,2,4,3, +3,3,0,3,3,1,3,2, +0,4,1,4,4,2,4,3, +4,0,1,4,4,2,4,3, +0,1,2,5,5,3,5,4, +3,3,3,0,3,1,3,2, +0,4,4,1,4,2,4,3, +4,0,4,1,4,2,4,3, +0,1,5,2,5,3,5,4, +4,4,0,1,4,2,4,3, +0,5,1,2,5,3,5,4, +5,0,1,2,5,3,5,4, +0,1,2,3,6,4,6,5, +3,3,3,3,0,1,3,2, +0,4,4,4,1,2,4,3, +4,0,4,4,1,2,4,3, +0,1,5,5,2,3,5,4, +4,4,0,4,1,2,4,3, +0,5,1,5,2,3,5,4, +5,0,1,5,2,3,5,4, +0,1,2,6,3,4,6,5, +4,4,4,0,1,2,4,3, +0,5,5,1,2,3,5,4, +5,0,5,1,2,3,5,4, 
+0,1,6,2,3,4,6,5, +5,5,0,1,2,3,5,4, +0,6,1,2,3,4,6,5, +6,0,1,2,3,4,6,5, +0,1,2,3,4,5,7,6, +2,2,2,2,2,2,0,1, +0,3,3,3,3,3,1,2, +3,0,3,3,3,3,1,2, +0,1,4,4,4,4,2,3, +3,3,0,3,3,3,1,2, +0,4,1,4,4,4,2,3, +4,0,1,4,4,4,2,3, +0,1,2,5,5,5,3,4, +3,3,3,0,3,3,1,2, +0,4,4,1,4,4,2,3, +4,0,4,1,4,4,2,3, +0,1,5,2,5,5,3,4, +4,4,0,1,4,4,2,3, +0,5,1,2,5,5,3,4, +5,0,1,2,5,5,3,4, +0,1,2,3,6,6,4,5, +3,3,3,3,0,3,1,2, +0,4,4,4,1,4,2,3, +4,0,4,4,1,4,2,3, +0,1,5,5,2,5,3,4, +4,4,0,4,1,4,2,3, +0,5,1,5,2,5,3,4, +5,0,1,5,2,5,3,4, +0,1,2,6,3,6,4,5, +4,4,4,0,1,4,2,3, +0,5,5,1,2,5,3,4, +5,0,5,1,2,5,3,4, +0,1,6,2,3,6,4,5, +5,5,0,1,2,5,3,4, +0,6,1,2,3,6,4,5, +6,0,1,2,3,6,4,5, +0,1,2,3,4,7,5,6, +3,3,3,3,3,0,1,2, +0,4,4,4,4,1,2,3, +4,0,4,4,4,1,2,3, +0,1,5,5,5,2,3,4, +4,4,0,4,4,1,2,3, +0,5,1,5,5,2,3,4, +5,0,1,5,5,2,3,4, +0,1,2,6,6,3,4,5, +4,4,4,0,4,1,2,3, +0,5,5,1,5,2,3,4, +5,0,5,1,5,2,3,4, +0,1,6,2,6,3,4,5, +5,5,0,1,5,2,3,4, +0,6,1,2,6,3,4,5, +6,0,1,2,6,3,4,5, +0,1,2,3,7,4,5,6, +4,4,4,4,0,1,2,3, +0,5,5,5,1,2,3,4, +5,0,5,5,1,2,3,4, +0,1,6,6,2,3,4,5, +5,5,0,5,1,2,3,4, +0,6,1,6,2,3,4,5, +6,0,1,6,2,3,4,5, +0,1,2,7,3,4,5,6, +5,5,5,0,1,2,3,4, +0,6,6,1,2,3,4,5, +6,0,6,1,2,3,4,5, +0,1,7,2,3,4,5,6, +6,6,0,1,2,3,4,5, +0,7,1,2,3,4,5,6, +7,0,1,2,3,4,5,6, +0,1,2,3,4,5,6,7 +}; +#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_) +#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(shuffles[_m_]))) ) +#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(_m_, _mm256_loadu_si256((__m256i*)(_v_))), u2vmask(_m_,tv)) /* now uses its own arguments instead of the caller's xm/pex; still relies on zv and tv being declared at the expansion site */ + #endif + +#define PAD8(__x) (((__x)+7)/8) + +//----------------------------------------------------------------------------- +#define VSTO( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov) +#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) +#include "bitunpack256v.c" + +#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\ + VSTO0(_op_, 0, ov,
_parm_);\ + VSTO0(_op_, 1, ov, _parm_);\ + VSTO0(_op_, 2, ov, _parm_);\ + VSTO0(_op_, 3, ov, _parm_);\ + VSTO0(_op_, 4, ov, _parm_);\ + VSTO0(_op_, 5, ov, _parm_);\ + VSTO0(_op_, 6, ov, _parm_);\ + VSTO0(_op_, 7, ov, _parm_);\ + VSTO0(_op_, 8, ov, _parm_);\ + VSTO0(_op_, 9, ov, _parm_);\ + VSTO0(_op_, 10, ov, _parm_);\ + VSTO0(_op_, 11, ov, _parm_);\ + VSTO0(_op_, 12, ov, _parm_);\ + VSTO0(_op_, 13, ov, _parm_);\ + VSTO0(_op_, 14, ov, _parm_);\ + VSTO0(_op_, 15, ov, _parm_);\ + VSTO0(_op_, 16, ov, _parm_);\ + VSTO0(_op_, 17, ov, _parm_);\ + VSTO0(_op_, 18, ov, _parm_);\ + VSTO0(_op_, 19, ov, _parm_);\ + VSTO0(_op_, 20, ov, _parm_);\ + VSTO0(_op_, 21, ov, _parm_);\ + VSTO0(_op_, 22, ov, _parm_);\ + VSTO0(_op_, 23, ov, _parm_);\ + VSTO0(_op_, 24, ov, _parm_);\ + VSTO0(_op_, 25, ov, _parm_);\ + VSTO0(_op_, 26, ov, _parm_);\ + VSTO0(_op_, 27, ov, _parm_);\ + VSTO0(_op_, 28, ov, _parm_);\ + VSTO0(_op_, 29, ov, _parm_);\ + VSTO0(_op_, 30, ov, _parm_);\ + VSTO0(_op_, 31, ov, _parm_);\ +} +#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256() + +unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv; + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + +//--------------------------------------- zeromask unpack for TurboPFor vp4d.c -------------------------------------- +#define VSTO(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm) +#define VSTO0(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm) +#define BITUNPACK0(_parm_) +#include "bitunpack256v.c" + +unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned 
char *bb) { + const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 +//-------------------------------- +#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) +#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG256x32(__ov); SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) +#include "bitunpack256v.c" + +#define BITUNPACK0(_parm_) + +unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- +#define VSTO(__op, i, __ov, __sv) SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) +#include "bitunpack256v.c" + +#define BITUNPACK0(_parm_) + +unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- +#define VEXP(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm) +#define VEXP0(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm) + +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, 
_ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); + +#include "bitunpack256v.c" + +#define BITUNPACK0(_parm_) + +unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { + const unsigned char *ip = in+PAD8(256*b); unsigned xm; + __m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + +//----------------------------------------------------------------------------- +#define VSTO(__op, i, __ov, __sv) SCANI256x32(__ov,__sv,cv); _mm256_storeu_si256(__op++, __sv); +#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv) +#include "bitunpack256v.c" + +#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8) + +unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b) { + const unsigned char *ip = in+PAD8(256*b); + __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 +//----------------------------------------------------------------------------- +#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); +#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); + +#include "bitunpack256v.c" + +#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128() + +unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { 
+ const unsigned char *ip = in+PAD8(256*b); unsigned xm; + __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); + BITUNPACK256V32(in, b, out, sv); + return (unsigned char *)ip; +} +#undef VSTO +#undef VSTO0 +#undef BITUNPACK0 + + #else +#include "bitunpack256v_.h" + +#define BITUNPACK256V32(__ip, __nbits, __op, _parm_) { __m256i mv,*_ov=(__m256i *)__op,*_iv=(__m256i *)__ip; \ + switch(__nbits&0x3f) {\ + case 0: BITUNPACK0(_parm_); BITUNPACK256V32_0( _iv, _ov, _parm_); break;\ + case 1: mv = _mm256_set1_epi32((1u<< 1)-1); BITUNPACK256V32_1( _iv, _ov, _parm_); break;\ + case 2: mv = _mm256_set1_epi32((1u<< 2)-1); BITUNPACK256V32_2( _iv, _ov, _parm_); break;\ + case 3: mv = _mm256_set1_epi32((1u<< 3)-1); BITUNPACK256V32_3( _iv, _ov, _parm_); break;\ + case 4: mv = _mm256_set1_epi32((1u<< 4)-1); BITUNPACK256V32_4( _iv, _ov, _parm_); break;\ + case 5: mv = _mm256_set1_epi32((1u<< 5)-1); BITUNPACK256V32_5( _iv, _ov, _parm_); break;\ + case 6: mv = _mm256_set1_epi32((1u<< 6)-1); BITUNPACK256V32_6( _iv, _ov, _parm_); break;\ + case 7: mv = _mm256_set1_epi32((1u<< 7)-1); BITUNPACK256V32_7( _iv, _ov, _parm_); break;\ + case 8: mv = _mm256_set1_epi32((1u<< 8)-1); BITUNPACK256V32_8( _iv, _ov, _parm_); break;\ + case 9: mv = _mm256_set1_epi32((1u<< 9)-1); BITUNPACK256V32_9( _iv, _ov, _parm_); break;\ + case 10: mv = _mm256_set1_epi32((1u<<10)-1); BITUNPACK256V32_10(_iv, _ov, _parm_); break;\ + case 11: mv = _mm256_set1_epi32((1u<<11)-1); BITUNPACK256V32_11(_iv, _ov, _parm_); break;\ + case 12: mv = _mm256_set1_epi32((1u<<12)-1); BITUNPACK256V32_12(_iv, _ov, _parm_); break;\ + case 13: mv = _mm256_set1_epi32((1u<<13)-1); BITUNPACK256V32_13(_iv, _ov, _parm_); break;\ + case 14: mv = _mm256_set1_epi32((1u<<14)-1); BITUNPACK256V32_14(_iv, _ov, _parm_); break;\ + case 15: mv = _mm256_set1_epi32((1u<<15)-1); BITUNPACK256V32_15(_iv, _ov, _parm_); break;\ + case 16: mv = 
_mm256_set1_epi32((1u<<16)-1); BITUNPACK256V32_16(_iv, _ov, _parm_); break;\ + case 17: mv = _mm256_set1_epi32((1u<<17)-1); BITUNPACK256V32_17(_iv, _ov, _parm_); break;\ + case 18: mv = _mm256_set1_epi32((1u<<18)-1); BITUNPACK256V32_18(_iv, _ov, _parm_); break;\ + case 19: mv = _mm256_set1_epi32((1u<<19)-1); BITUNPACK256V32_19(_iv, _ov, _parm_); break;\ + case 20: mv = _mm256_set1_epi32((1u<<20)-1); BITUNPACK256V32_20(_iv, _ov, _parm_); break;\ + case 21: mv = _mm256_set1_epi32((1u<<21)-1); BITUNPACK256V32_21(_iv, _ov, _parm_); break;\ + case 22: mv = _mm256_set1_epi32((1u<<22)-1); BITUNPACK256V32_22(_iv, _ov, _parm_); break;\ + case 23: mv = _mm256_set1_epi32((1u<<23)-1); BITUNPACK256V32_23(_iv, _ov, _parm_); break;\ + case 24: mv = _mm256_set1_epi32((1u<<24)-1); BITUNPACK256V32_24(_iv, _ov, _parm_); break;\ + case 25: mv = _mm256_set1_epi32((1u<<25)-1); BITUNPACK256V32_25(_iv, _ov, _parm_); break;\ + case 26: mv = _mm256_set1_epi32((1u<<26)-1); BITUNPACK256V32_26(_iv, _ov, _parm_); break;\ + case 27: mv = _mm256_set1_epi32((1u<<27)-1); BITUNPACK256V32_27(_iv, _ov, _parm_); break;\ + case 28: mv = _mm256_set1_epi32((1u<<28)-1); BITUNPACK256V32_28(_iv, _ov, _parm_); break;\ + case 29: mv = _mm256_set1_epi32((1u<<29)-1); BITUNPACK256V32_29(_iv, _ov, _parm_); break;\ + case 30: mv = _mm256_set1_epi32((1u<<30)-1); BITUNPACK256V32_30(_iv, _ov, _parm_); break;\ + case 31: mv = _mm256_set1_epi32((1u<<31)-1); BITUNPACK256V32_31(_iv, _ov, _parm_); break;\ + case 32: mv = _mm256_set1_epi32((1ull<<32)-1);BITUNPACK256V32_32(_iv, _ov, _parm_); break;\ + case 33 ... 
63: break;\ + }\ +} + #endif + + diff --git a/bitunpack256v_.h b/bitunpack256v_.h new file mode 100644 index 0000000..0b3f350 --- /dev/null +++ b/bitunpack256v_.h @@ -0,0 +1,2002 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// TurboPFor: Integer Compression SIMD bit unpacking +#define BITUNPACK256V32_0(ip, op, parm) {\ + BITUNBLK256V32_0(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_1(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 
7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 25),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 27),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 28),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 29),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 30),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define 
BITUNPACK256V32_1(ip, op, parm) {\ + BITUNBLK256V32_1(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_2(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 28),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK256V32_2(ip, op, parm) {\ + BITUNBLK256V32_2(ip, 0, op, parm);\ + BITUNBLK256V32_2(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_3(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 3,ov,parm); \ + 
ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 27),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm); /* field straddles two input vectors: low bits from the old iv, high bits from the reloaded iv */\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 25),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 28),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv);
VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_3(ip, op, parm) {\ + BITUNBLK256V32_3(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_4(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 2,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*8+ 3,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*8+ 5,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*8+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACK256V32_4(ip, op, parm) {\ + BITUNBLK256V32_4(ip, 0, op, parm);\ + BITUNBLK256V32_4(ip, 1, op, parm);\ + BITUNBLK256V32_4(ip, 2, op, parm);\ + BITUNBLK256V32_4(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_5(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+ 
4,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 25),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm); /* field straddles two input vectors: low bits from the old iv, high bits from the reloaded iv */\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv =
_mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_5(ip, op, parm) {\ + BITUNBLK256V32_5(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_6(ip, i, op, parm) { /* emits 16 output vectors through VSTO from 3 loaded input vectors */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+11,ov,parm); \ + ov =
_mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK256V32_6(ip, op, parm) {\ + BITUNBLK256V32_6(ip, 0, op, parm);\ + BITUNBLK256V32_6(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_7(ip, i, op, parm) { /* emits 32 output vectors through VSTO from 7 loaded input vectors */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv,
2),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_7(ip, op, parm) {\ + BITUNBLK256V32_7(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_8(ip, i, op, parm) { /* emits 4 output vectors through VSTO from 1 loaded input vector */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ + ov =
_mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*4+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*4+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); VSTO(op,i*4+ 3,ov,parm); ;\ +} + +#define BITUNPACK256V32_8(ip, op, parm) {\ + BITUNBLK256V32_8(ip, 0, op, parm);\ + BITUNBLK256V32_8(ip, 1, op, parm);\ + BITUNBLK256V32_8(ip, 2, op, parm);\ + BITUNBLK256V32_8(ip, 3, op, parm);\ + BITUNBLK256V32_8(ip, 4, op, parm);\ + BITUNBLK256V32_8(ip, 5, op, parm);\ + BITUNBLK256V32_8(ip, 6, op, parm);\ + BITUNBLK256V32_8(ip, 7, op, parm);\ +} + +#define BITUNBLK256V32_9(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); 
VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+30,ov,parm); \ + ov = 
_mm256_srli_epi32(iv, 23); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_9(ip, op, parm) {\ + BITUNBLK256V32_9(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_10(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK256V32_10(ip, op, parm) {\ + BITUNBLK256V32_10(ip, 0, op, parm);\ + 
BITUNBLK256V32_10(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_11(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + 
ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm256_srli_epi32(iv, 21); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_11(ip, op, parm) {\ + BITUNBLK256V32_11(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_12(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); 
VSTO(op,i*8+ 1,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 3,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACK256V32_12(ip, op, parm) {\ + BITUNBLK256V32_12(ip, 0, op, parm);\ + BITUNBLK256V32_12(ip, 1, op, parm);\ + BITUNBLK256V32_12(ip, 2, op, parm);\ + BITUNBLK256V32_12(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_13(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); 
\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = 
_mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm256_srli_epi32(iv, 19); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_13(ip, op, parm) {\ + BITUNBLK256V32_13(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_14(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm256_srli_epi32(iv, 18); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK256V32_14(ip, op, parm) {\ + BITUNBLK256V32_14(ip, 0, op, parm);\ + BITUNBLK256V32_14(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_15(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm256_srli_epi32(iv, 17); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_15(ip, op, parm) {\ + BITUNBLK256V32_15(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_16(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*2+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 16); VSTO(op,i*2+ 1,ov,parm); ;\ +} + +#define BITUNPACK256V32_16(ip, op, parm) {\ + BITUNBLK256V32_16(ip, 0, op, parm);\ + BITUNBLK256V32_16(ip, 1, op, parm);\ + BITUNBLK256V32_16(ip, 2, op, parm);\ + BITUNBLK256V32_16(ip, 3, op, parm);\ + BITUNBLK256V32_16(ip, 4, op, parm);\ + BITUNBLK256V32_16(ip, 5, op, parm);\ + BITUNBLK256V32_16(ip, 6, op, parm);\ + BITUNBLK256V32_16(ip, 7, op, parm);\ + BITUNBLK256V32_16(ip, 8, op, parm);\ + BITUNBLK256V32_16(ip, 9, op, parm);\ + BITUNBLK256V32_16(ip, 10, op, parm);\ + BITUNBLK256V32_16(ip, 11, op, parm);\ + BITUNBLK256V32_16(ip, 12, op, parm);\ + BITUNBLK256V32_16(ip, 13, op, parm);\ + BITUNBLK256V32_16(ip, 14, op, parm);\ + BITUNBLK256V32_16(ip, 15, op, parm);\ +} 
+ +#define BITUNBLK256V32_17(ip, i, op, parm) { /* 17-bit step: 17 loads -> 32 outputs */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+14,ov,parm); \ + ov = 
_mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+29,ov,parm); \ + ov = 
_mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_17(ip, op, parm) { /* 1 step x 32 outputs */\ + BITUNBLK256V32_17(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_18(ip, i, op, parm) { /* 18-bit step: 9 loads -> 16 outputs */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); 
iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACK256V32_18(ip, op, parm) { /* 2 steps x 16 outputs */\ + BITUNBLK256V32_18(ip, 0, op, parm);\ + BITUNBLK256V32_18(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_19(ip, i, op, parm) { /* 19-bit step: 19 loads -> 32 outputs */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = 
_mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); 
VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_19(ip, op, parm) { /* 1 step x 32 outputs */\ + BITUNBLK256V32_19(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_20(ip, i, op, parm) { /* 20-bit step: 5 loads -> 8 outputs */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 5,ov,parm); \ + ov = 
_mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACK256V32_20(ip, op, parm) { /* 4 steps x 8 outputs */\ + BITUNBLK256V32_20(ip, 0, op, parm);\ + BITUNBLK256V32_20(ip, 1, op, parm);\ + BITUNBLK256V32_20(ip, 2, op, parm);\ + BITUNBLK256V32_20(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_21(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_21(ip, op, parm) {\ + BITUNBLK256V32_21(ip, 0, op, parm);\ +} +/* BITUNBLK256V32_22: extracts 16 values of 22 bits per 32-bit lane from 11 consecutive 256-bit loads at ip (ip is post-incremented on every load). Each value is combined with mv (declared outside this macro; presumably the 22-bit lane mask) and handed to VSTO(op, i*16+k, ov, parm). Loads cast to __m256i *, the pointer type _mm256_loadu_si256 takes. */ +#define BITUNBLK256V32_22(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov =
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACK256V32_22: expands BITUNBLK256V32_22 for i = 0..1, i.e. 32 VSTO calls in total. */ +#define BITUNPACK256V32_22(ip, op, parm) {\ + BITUNBLK256V32_22(ip, 0, op, parm);\ + BITUNBLK256V32_22(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_23(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv =
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); 
VSTO(op,i*32+28,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_23(ip, op, parm) {\ + BITUNBLK256V32_23(ip, 0, op, parm);\ +} +/* BITUNBLK256V32_24: extracts 4 values of 24 bits per 32-bit lane from 3 consecutive 256-bit loads at ip (ip is post-incremented on every load). Each value is combined with mv (declared outside this macro; presumably the 24-bit lane mask) and handed to VSTO(op, i*4+k, ov, parm). Loads cast to __m256i *, the pointer type _mm256_loadu_si256 takes. */ +#define BITUNBLK256V32_24(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); VSTO(op,i*4+ 3,ov,parm); ;\ +} +/* BITUNPACK256V32_24: expands BITUNBLK256V32_24 for i = 0..7, i.e. 32 VSTO calls in total. */ +#define BITUNPACK256V32_24(ip, op, parm) {\ + BITUNBLK256V32_24(ip, 0, op, parm);\ + BITUNBLK256V32_24(ip, 1, op, parm);\ + BITUNBLK256V32_24(ip, 2, op, parm);\ + BITUNBLK256V32_24(ip, 3, op, parm);\ + BITUNBLK256V32_24(ip, 4, op, parm);\ + BITUNBLK256V32_24(ip, 5, op, parm);\ + BITUNBLK256V32_24(ip, 6, op, parm);\ + BITUNBLK256V32_24(ip, 7, op, parm);\ +} + +#define BITUNBLK256V32_25(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ +
ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); 
VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_25(ip, op, parm) {\ + BITUNBLK256V32_25(ip, 0, op, parm);\ +} +/* BITUNBLK256V32_26: extracts 16 values of 26 bits per 32-bit lane from 13 consecutive 256-bit loads at ip (ip is post-incremented on every load). Each value is combined with mv (declared outside this macro; presumably the 26-bit lane mask) and handed to VSTO(op, i*16+k, ov, parm). Loads cast to __m256i *, the pointer type _mm256_loadu_si256 takes. */ +#define BITUNBLK256V32_26(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov,
_mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACK256V32_26: expands BITUNBLK256V32_26 for i = 0..1, i.e. 32 VSTO calls in total. */ +#define BITUNPACK256V32_26(ip, op, parm) {\ + BITUNBLK256V32_26(ip, 0, op, parm);\ + BITUNBLK256V32_26(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_27(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv =
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); 
VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_27(ip, op, parm) {\ + BITUNBLK256V32_27(ip, 0, op, parm);\ +} +/* BITUNBLK256V32_28: extracts 8 values of 28 bits per 32-bit lane from 7 consecutive 256-bit loads at ip (ip is post-incremented on every load). Each value is combined with mv (declared outside this macro; presumably the 28-bit lane mask) and handed to VSTO(op, i*8+k, ov, parm). Loads cast to __m256i *, the pointer type _mm256_loadu_si256 takes. */ +#define BITUNBLK256V32_28(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); VSTO(op,i*8+ 7,ov,parm); ;\ +} +/* BITUNPACK256V32_28: expands BITUNBLK256V32_28 for i = 0..3, i.e. 32 VSTO calls in total. */ +#define BITUNPACK256V32_28(ip, op, parm) {\ + BITUNBLK256V32_28(ip, 0, op, parm);\ + BITUNBLK256V32_28(ip, 1, op, parm);\ + BITUNBLK256V32_28(ip, 2, op, parm);\ +
BITUNBLK256V32_28(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_29(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov,
_mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_29(ip, op, parm) {\ + BITUNBLK256V32_29(ip, 0, op, parm);\ +} +/* BITUNBLK256V32_30: extracts 16 values of 30 bits per 32-bit lane from 15 consecutive 256-bit loads at ip (ip is post-incremented on every load). Each value is combined with mv (declared outside this macro; presumably the 30-bit lane mask) and handed to VSTO(op, i*16+k, ov, parm). Loads cast to __m256i *, the pointer type _mm256_loadu_si256 takes. */ +#define BITUNBLK256V32_30(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv));
VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ + ov =
_mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACK256V32_30: expands BITUNBLK256V32_30 for i = 0..1, i.e. 32 VSTO calls in total. */ +#define BITUNPACK256V32_30(ip, op, parm) {\ + BITUNBLK256V32_30(ip, 0, op, parm);\ + BITUNBLK256V32_30(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_31(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv =
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACK256V32_31(ip, op, parm) {\ + 
BITUNBLK256V32_31(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_32(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_and_si256( iv ,mv); VSTO(op,i*1+ 0,ov,parm); ;\ +} + +#define BITUNPACK256V32_32(ip, op, parm) {\ + BITUNBLK256V32_32(ip, 0, op, parm);\ + BITUNBLK256V32_32(ip, 1, op, parm);\ + BITUNBLK256V32_32(ip, 2, op, parm);\ + BITUNBLK256V32_32(ip, 3, op, parm);\ + BITUNBLK256V32_32(ip, 4, op, parm);\ + BITUNBLK256V32_32(ip, 5, op, parm);\ + BITUNBLK256V32_32(ip, 6, op, parm);\ + BITUNBLK256V32_32(ip, 7, op, parm);\ + BITUNBLK256V32_32(ip, 8, op, parm);\ + BITUNBLK256V32_32(ip, 9, op, parm);\ + BITUNBLK256V32_32(ip, 10, op, parm);\ + BITUNBLK256V32_32(ip, 11, op, parm);\ + BITUNBLK256V32_32(ip, 12, op, parm);\ + BITUNBLK256V32_32(ip, 13, op, parm);\ + BITUNBLK256V32_32(ip, 14, op, parm);\ + BITUNBLK256V32_32(ip, 15, op, parm);\ + BITUNBLK256V32_32(ip, 16, op, parm);\ + BITUNBLK256V32_32(ip, 17, op, parm);\ + BITUNBLK256V32_32(ip, 18, op, parm);\ + BITUNBLK256V32_32(ip, 19, op, parm);\ + BITUNBLK256V32_32(ip, 20, op, parm);\ + BITUNBLK256V32_32(ip, 21, op, parm);\ + BITUNBLK256V32_32(ip, 22, op, parm);\ + BITUNBLK256V32_32(ip, 23, op, parm);\ + BITUNBLK256V32_32(ip, 24, op, parm);\ + BITUNBLK256V32_32(ip, 25, op, parm);\ + BITUNBLK256V32_32(ip, 26, op, parm);\ + BITUNBLK256V32_32(ip, 27, op, parm);\ + BITUNBLK256V32_32(ip, 28, op, parm);\ + BITUNBLK256V32_32(ip, 29, op, parm);\ + BITUNBLK256V32_32(ip, 30, op, parm);\ + BITUNBLK256V32_32(ip, 31, op, parm);\ +} + +#define BITUNBLK256V32_33(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_33(ip, op, parm) {\ + BITUNBLK256V32_33(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_34(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), 
mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_34(ip, op, parm) {\ + BITUNBLK256V32_34(ip, 0, op, parm);\ + BITUNBLK256V32_34(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_35(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), 
mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_35(ip, op, parm) {\ + BITUNBLK256V32_35(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_36(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 7,ov,parm);;\ +} + +#define BITUNPACK256V32_36(ip, op, 
parm) {\ + BITUNBLK256V32_36(ip, 0, op, parm);\ + BITUNBLK256V32_36(ip, 1, op, parm);\ + BITUNBLK256V32_36(ip, 2, op, parm);\ + BITUNBLK256V32_36(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_37(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), 
mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define 
BITUNPACK256V32_37(ip, op, parm) {\ + BITUNBLK256V32_37(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_38(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_38(ip, op, parm) {\ + BITUNBLK256V32_38(ip, 0, op, parm);\ + BITUNBLK256V32_38(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_39(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 
28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov 
= _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_39(ip, op, parm) {\ + BITUNBLK256V32_39(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_40(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 3,ov,parm);;\ +} + +#define BITUNPACK256V32_40(ip, op, parm) {\ + BITUNBLK256V32_40(ip, 0, op, parm);\ + BITUNBLK256V32_40(ip, 1, op, parm);\ + 
BITUNBLK256V32_40(ip, 2, op, parm);\ + BITUNBLK256V32_40(ip, 3, op, parm);\ + BITUNBLK256V32_40(ip, 4, op, parm);\ + BITUNBLK256V32_40(ip, 5, op, parm);\ + BITUNBLK256V32_40(ip, 6, op, parm);\ + BITUNBLK256V32_40(ip, 7, op, parm);\ +} + +#define BITUNBLK256V32_41(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), 
mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_41(ip, op, parm) {\ + BITUNBLK256V32_41(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_42(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 
9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_42(ip, op, parm) {\ + BITUNBLK256V32_42(ip, 0, op, parm);\ + BITUNBLK256V32_42(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_43(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); 
VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_43(ip, op, parm) {\ + BITUNBLK256V32_43(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_44(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 7,ov,parm);;\ +} + +#define BITUNPACK256V32_44(ip, op, parm) {\ + BITUNBLK256V32_44(ip, 0, op, parm);\ + BITUNBLK256V32_44(ip, 1, op, parm);\ + BITUNBLK256V32_44(ip, 2, op, parm);\ + BITUNBLK256V32_44(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_45(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); 
VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_45(ip, op, parm) {\ + BITUNBLK256V32_45(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_46(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_46(ip, op, parm) {\ + BITUNBLK256V32_46(ip, 0, op, parm);\ + BITUNBLK256V32_46(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_47(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_47(ip, op, parm) {\ + BITUNBLK256V32_47(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_48(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*2+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*2+ 1,ov,parm);;\ +} + +#define BITUNPACK256V32_48(ip, op, parm) {\ + BITUNBLK256V32_48(ip, 0, op, parm);\ + BITUNBLK256V32_48(ip, 1, op, parm);\ + BITUNBLK256V32_48(ip, 2, op, parm);\ + BITUNBLK256V32_48(ip, 3, op, parm);\ + BITUNBLK256V32_48(ip, 4, op, parm);\ + BITUNBLK256V32_48(ip, 5, op, parm);\ + BITUNBLK256V32_48(ip, 6, op, parm);\ + BITUNBLK256V32_48(ip, 7, op, parm);\ + BITUNBLK256V32_48(ip, 8, op, parm);\ + BITUNBLK256V32_48(ip, 9, op, parm);\ + BITUNBLK256V32_48(ip, 10, op, parm);\ + BITUNBLK256V32_48(ip, 11, op, parm);\ + BITUNBLK256V32_48(ip, 12, op, parm);\ + BITUNBLK256V32_48(ip, 13, op, parm);\ + BITUNBLK256V32_48(ip, 14, op, parm);\ + BITUNBLK256V32_48(ip, 15, op, parm);\ +} + +#define BITUNBLK256V32_49(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), 
mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_49(ip, op, parm) {\ + BITUNBLK256V32_49(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_50(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 
5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_50(ip, op, parm) {\ + BITUNBLK256V32_50(ip, 0, op, parm);\ + BITUNBLK256V32_50(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_51(ip, i, op, parm) { __m256i ov,iv = 
_mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_51(ip, op, parm) {\ + BITUNBLK256V32_51(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_52(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 7,ov,parm);;\ +} + +#define BITUNPACK256V32_52(ip, op, parm) {\ + BITUNBLK256V32_52(ip, 0, op, parm);\ + BITUNBLK256V32_52(ip, 1, op, parm);\ + BITUNBLK256V32_52(ip, 2, op, parm);\ + BITUNBLK256V32_52(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_53(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 
1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); 
iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_53(ip, op, parm) {\ + BITUNBLK256V32_53(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_54(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); 
VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_54(ip, op, parm) {\ + BITUNBLK256V32_54(ip, 0, op, parm);\ + BITUNBLK256V32_54(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_55(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 
31), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_55(ip, op, parm) {\ + BITUNBLK256V32_55(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_56(ip, i, op, parm) { /* One 56-step group: after the initial load, each of the 4 statements forms (iv >> s) | ((next load << (32-s)) & mv) per 32-bit lane and hands it to VSTO with index i*4+j; ip is post-incremented on all 5 loads. Every load casts to __m256i*, the pointer type _mm256_loadu_si256 takes. mv and VSTO come from the including code. */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 3,ov,parm);\ +} + +#define BITUNPACK256V32_56(ip, op, parm) { /* Expands the 56 group for i = 0..7, in order. */\ + BITUNBLK256V32_56(ip, 0, op, parm);\ + BITUNBLK256V32_56(ip, 1, op, parm);\ + BITUNBLK256V32_56(ip, 2, op, parm);\ + BITUNBLK256V32_56(ip, 3, op, parm);\ + BITUNBLK256V32_56(ip, 4, op, parm);\ + BITUNBLK256V32_56(ip, 5, op, parm);\ + BITUNBLK256V32_56(ip, 6, op, parm);\ + BITUNBLK256V32_56(ip, 7, op, parm);\ +} + +#define BITUNBLK256V32_57(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov,
_mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), 
mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_57(ip, op, parm) {\ + BITUNBLK256V32_57(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_58(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv 
= _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_58(ip, op, parm) {\ + BITUNBLK256V32_58(ip, 0, op, parm);\ + BITUNBLK256V32_58(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_59(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), 
mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_59(ip, op, parm) {\ + BITUNBLK256V32_59(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_60(ip, i, op, parm) { /* One 60-step group: after the initial load, each of the 8 statements forms (iv >> s) | ((next load << (32-s)) & mv) per 32-bit lane and hands it to VSTO with index i*8+j; ip is post-incremented on all 9 loads. Every load casts to __m256i*, the pointer type _mm256_loadu_si256 takes. mv and VSTO come from the including code. */ __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 7,ov,parm);\ +} +
+#define BITUNPACK256V32_60(ip, op, parm) {\ + BITUNBLK256V32_60(ip, 0, op, parm);\ + BITUNBLK256V32_60(ip, 1, op, parm);\ + BITUNBLK256V32_60(ip, 2, op, parm);\ + BITUNBLK256V32_60(ip, 3, op, parm);\ +} + +#define BITUNBLK256V32_61(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), 
mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_61(ip, op, parm) {\ + BITUNBLK256V32_61(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_62(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 
9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+15,ov,parm);;\ +} + +#define BITUNPACK256V32_62(ip, op, parm) {\ + BITUNBLK256V32_62(ip, 0, op, parm);\ + BITUNBLK256V32_62(ip, 1, op, parm);\ +} + +#define BITUNBLK256V32_63(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ + ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); 
VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = 
_mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm256_srli_epi32(iv, 6); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+31,ov,parm);;\ +} + +#define BITUNPACK256V32_63(ip, op, parm) {\ + BITUNBLK256V32_63(ip, 0, op, parm);\ +} + +#define BITUNBLK256V32_64(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ + ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*1+ 0,ov,parm);;\ +} + +#define BITUNPACK256V32_64(ip, op, parm) {\ + BITUNBLK256V32_64(ip, 0, op, parm);\ + BITUNBLK256V32_64(ip, 1, op, parm);\ + BITUNBLK256V32_64(ip, 2, op, parm);\ + BITUNBLK256V32_64(ip, 3, op, parm);\ + BITUNBLK256V32_64(ip, 4, op, parm);\ + BITUNBLK256V32_64(ip, 5, op, parm);\ + BITUNBLK256V32_64(ip, 6, op, parm);\ + BITUNBLK256V32_64(ip, 7, op, parm);\ + BITUNBLK256V32_64(ip, 8, op, parm);\ + BITUNBLK256V32_64(ip, 9, op, parm);\ + BITUNBLK256V32_64(ip, 10, op, parm);\ + BITUNBLK256V32_64(ip, 11, op, parm);\ + BITUNBLK256V32_64(ip, 12, op, parm);\ + BITUNBLK256V32_64(ip, 13, op, parm);\ + 
BITUNBLK256V32_64(ip, 14, op, parm);\ + BITUNBLK256V32_64(ip, 15, op, parm);\ + BITUNBLK256V32_64(ip, 16, op, parm);\ + BITUNBLK256V32_64(ip, 17, op, parm);\ + BITUNBLK256V32_64(ip, 18, op, parm);\ + BITUNBLK256V32_64(ip, 19, op, parm);\ + BITUNBLK256V32_64(ip, 20, op, parm);\ + BITUNBLK256V32_64(ip, 21, op, parm);\ + BITUNBLK256V32_64(ip, 22, op, parm);\ + BITUNBLK256V32_64(ip, 23, op, parm);\ + BITUNBLK256V32_64(ip, 24, op, parm);\ + BITUNBLK256V32_64(ip, 25, op, parm);\ + BITUNBLK256V32_64(ip, 26, op, parm);\ + BITUNBLK256V32_64(ip, 27, op, parm);\ + BITUNBLK256V32_64(ip, 28, op, parm);\ + BITUNBLK256V32_64(ip, 29, op, parm);\ + BITUNBLK256V32_64(ip, 30, op, parm);\ + BITUNBLK256V32_64(ip, 31, op, parm);\ +} + diff --git a/bitunpack64_.h b/bitunpack64_.h index fdf91f0..70142d7 100644 --- a/bitunpack64_.h +++ b/bitunpack64_.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// bitunpack64_.h - "Integer Compression" scalar bit packing +// bitunpack include #define BITUNBLK32_0(ip, i, op, parm) { \ DST(op,i*0+ 0, 0, parm);\ DST(op,i*0+ 1, 0, parm);\ @@ -3101,3 +3101,4 @@ BITUNBLK64_64(ip, 30, op, parm);\ BITUNBLK64_64(ip, 31, op, parm); DSTI(op); ip += 64*4/sizeof(ip[0]);\ } + diff --git a/bitutil.c b/bitutil.c index 3caff40..a05e751 100644 --- a/bitutil.c +++ b/bitutil.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2016 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,20 +21,20 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// bitutil.h - "Integer Compression" -#include "conf.h" +// "Integer Compression" utility - delta, for, zigzag +#include "conf.h" #include "bitutil.h" -#define BITDELTA(__p,__n, __inc, 
__start, __act) {\ - typeof(__p[0]) _x, *_p;\ - for(_p = __p; _p != __p+(__n&~(4-1)); ) {\ - _x = (*_p)-__start-__inc; __start = *_p++; __act;\ - _x = (*_p)-__start-__inc; __start = *_p++; __act;\ - _x = (*_p)-__start-__inc; __start = *_p++; __act;\ - _x = (*_p)-__start-__inc; __start = *_p++; __act;\ +#define BITDELTA(_p_,_n_, __inc, _start_, _act_) {\ + typeof(_p_[0]) _x, *_p;\ + for(_p = _p_; _p != _p_+(_n_&~(4-1)); ) {\ + _x = (*_p)-_start_-__inc; _start_ = *_p++; _act_;\ + _x = (*_p)-_start_-__inc; _start_ = *_p++; _act_;\ + _x = (*_p)-_start_-__inc; _start_ = *_p++; _act_;\ + _x = (*_p)-_start_-__inc; _start_ = *_p++; _act_;\ }\ - while(_p != __p+__n) { \ - _x = *_p-__start-__inc; __start = *_p++; __act;\ + while(_p != _p_+_n_) { \ + _x = *_p-_start_-__inc; _start_ = *_p++; _act_;\ }\ } @@ -51,15 +51,16 @@ }\ } -#define BITMINMAX(__p,__n, __mi, __mx) {\ - typeof(__p[0]) _x, *_p;\ - for(_p = __p, __mi = __mx = 0; _p != __p+(__n&~(4-1)); ) {\ + +#define BITMINMAX(_p_,_n_, __mi, __mx) {\ + typeof(_p_[0]) _x, *_p;\ + for(_p = _p_, __mi = __mx = 0; _p != _p_+(_n_&~(4-1)); ) {\ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ }\ - while(_p != __p+__n) { \ + while(_p != _p_+_n_) { \ if(*_p < __mi) __mi = *_p; if(*_p > __mx) __mx = *_p; _p++; \ }\ } @@ -70,12 +71,12 @@ unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, uns __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(inc), dv; for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); - bv = _mm_or_si128(bv, dv = _mm_sub_epi32(DELTA128_32(iv,sv),cv)); + bv = _mm_or_si128(bv, dv = _mm_sub_epi32(DELTA128x32(iv,sv),cv)); sv = iv; _mm_storeu_si128((__m128i *)op, dv); } start = 
(unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); - HOR128_32(bv, b); + HOR128x32(bv, b); while(ip != in+n) { unsigned x = *ip-start-inc; start = *ip++; @@ -123,12 +124,12 @@ unsigned bitd32(unsigned *in, unsigned n, unsigned start) { unsigned *ip,b; __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start); for(ip = in; ip != in+(n&~(4-1)); ip += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); - bv = _mm_or_si128(bv, DELTA128_32(iv,sv)); + bv = _mm_or_si128(bv, DELTA128x32(iv,sv)); sv = iv; } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); - HOR128_32(bv, b); + HOR128x32(bv, b); while(ip != in+n) { unsigned x = *ip-start; start = *ip++; @@ -146,12 +147,12 @@ unsigned bitd132(unsigned *in, unsigned n, unsigned start) { unsigned *ip,b; __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); for(ip = in; ip != in+(n&~(4-1)); ip += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); - bv = _mm_or_si128(bv, _mm_sub_epi32(DELTA128_32(iv,sv),cv)); + bv = _mm_or_si128(bv, _mm_sub_epi32(DELTA128x32(iv,sv),cv)); sv = iv; } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); - HOR128_32(bv, b); + HOR128x32(bv, b); while(ip != in+n) { unsigned x = *ip-start-1; start = *ip++; @@ -173,7 +174,7 @@ void bitund132(unsigned *p, unsigned n, unsigned x) { unsigned *ip; for(ip = p; ip != p+(n&~(4-1)); ip += 4) { __m128i v = _mm_loadu_si128((__m128i *)ip); - SCANI128_32(v, sv, cv); + SCANI128x32(v, sv, cv); _mm_storeu_si128((__m128i *)ip, sv); } x = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); @@ -190,29 +191,29 @@ void bitundx32(unsigned *p, unsigned n, unsigned x, unsigned inc) { BITUNDELTA(p void bitundx64(uint64_t *p, unsigned n, uint64_t x, unsigned inc) { BITUNDELTA(p, n, x, inc); } //----------------------------- zigzag -------------------------------------------------------- -#define BITZIGZAG(__p,__n, __start, __act) {\ - typeof(__p[0]) *_p;\ - for(_p = __p; _p != __p+(__n&~(4-1)); ) {\ - _x = 
((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ - _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ - _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ - _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ +#define BITZIGZAG(_p_,_n_, _start_, _act_) {\ + typeof(_p_[0]) *_p;\ + for(_p = _p_; _p != _p_+(_n_&~(4-1)); ) {\ + _x = ((int)(*_p)-(int)_start_); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); _start_ = *_p++; _act_;\ + _x = ((int)(*_p)-(int)_start_); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); _start_ = *_p++; _act_;\ + _x = ((int)(*_p)-(int)_start_); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); _start_ = *_p++; _act_;\ + _x = ((int)(*_p)-(int)_start_); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); _start_ = *_p++; _act_;\ }\ - while(_p != __p+__n) { \ - _x = ((int)(*_p)-(int)__start); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); __start = *_p++; __act;\ + while(_p != _p_+_n_) { \ + _x = ((int)(*_p)-(int)_start_); _x = (_x << 1) ^ (_x >> (sizeof(_x)*8-1)); _start_ = *_p++; _act_;\ }\ } -#define BITUNZIGZAG(__p, __n, __start) {\ - typeof(__p[0]) *_p, _z;\ - for(_p = __p; _p != __p+(__n&~(4-1)); ) {\ - _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ - _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ - _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ - _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ +#define BITUNZIGZAG(_p_, _n_, _start_) {\ + typeof(_p_[0]) *_p, _z;\ + for(_p = _p_; _p != _p_+(_n_&~(4-1)); ) {\ + _z = *_p; *_p = (_start_ += (_z >> 1 ^ -(_z & 1))); _p++;\ + _z = *_p; *_p = (_start_ += (_z >> 1 ^ -(_z & 1))); _p++;\ + _z = *_p; *_p = (_start_ += (_z >> 1 ^ -(_z & 1))); _p++;\ + _z = *_p; *_p = (_start_ += (_z >> 1 ^ -(_z & 1))); _p++;\ }\ - while(_p != __p+__n) {\ - _z = *_p; *_p = (__start += (_z >> 1 ^ -(_z & 1))); _p++;\ + 
while(_p != _p_+_n_) {\ + _z = *_p; *_p = (_start_ += (_z >> 1 ^ -(_z & 1))); _p++;\ }\ } @@ -222,13 +223,13 @@ unsigned bitz32(unsigned *in, unsigned n, unsigned start) { __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), dv; for(ip = in; ip != in+(n&~(4-1)); ip += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); - dv = DELTA128_32(iv,sv); + dv = DELTA128x32(iv,sv); sv = iv; - dv = ZIGZAG128_32(dv); + dv = ZIGZAG128x32(dv); bv = _mm_or_si128(bv, dv); } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); - HOR128_32(bv, b); + HOR128x32(bv, b); while(ip != in+n) { int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); @@ -236,7 +237,7 @@ unsigned bitz32(unsigned *in, unsigned n, unsigned start) { b |= x; } #else - typeof(in[0]) b = 0,*op = out; + typeof(in[0]) b = 0; int _x; BITZIGZAG(in, n, start, b |= (unsigned)_x); #endif @@ -249,14 +250,14 @@ unsigned bitzigzag32(unsigned *in, unsigned n, unsigned *out, unsigned start) { __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), dv; for(ip = in; ip != in+(n&~(4-1)); ip += 4,op += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); - dv = DELTA128_32(iv,sv); + dv = DELTA128x32(iv,sv); sv = iv; - dv = ZIGZAG128_32(dv); + dv = ZIGZAG128x32(dv); bv = _mm_or_si128(bv, dv); _mm_storeu_si128((__m128i *)op, dv); } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); - HOR128_32(bv, b); + HOR128x32(bv, b); while(ip != in+n) { int x = ((int)(*ip)-(int)start); x = (x << 1) ^ (x >> 31); @@ -278,8 +279,8 @@ void bitunzigzag32(unsigned *p, unsigned n, unsigned start) { unsigned *ip; for(ip = p; ip != p+(n&~(4-1)); ip += 4) { __m128i iv = _mm_loadu_si128((__m128i *)ip); - iv = UNZIGZAG128_32(iv); - SCAN128_32(iv, sv); + iv = UNZIGZAG128x32(iv); + SCAN128x32(iv, sv); _mm_storeu_si128((__m128i *)ip, sv); } start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12)); diff --git a/bitutil.h b/bitutil.h index 3ca9390..fd8a40a 100644 --- a/bitutil.h +++ b/bitutil.h @@ -1,5 +1,5 @@ /** - Copyright (C) 
powturbo 2013-2016 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,63 +21,108 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// bitutil.h - "Integer Compression" +// "Integer Compression" #include -#define _BITFORZERO(_out_, _n_, _start_, _inc_) do { unsigned _i;\ - for(_i = 0; _i != (_n_&~3); ) {\ - _out_[_i] = _start_+_i*_inc_; _i++;\ - _out_[_i] = _start_+_i*_inc_; _i++;\ - _out_[_i] = _start_+_i*_inc_; _i++;\ - _out_[_i] = _start_+_i*_inc_; _i++;\ +#define BITFORSET_(_out_, _n_, _start_, _inc_) do { unsigned _i;\ + for(_i = 0; _i != (_n_&~3); _i+=4) {\ + _out_[_i+0] = _start_+(_i )*_inc_;\ + _out_[_i+1] = _start_+(_i+1)*_inc_;\ + _out_[_i+2] = _start_+(_i+2)*_inc_;\ + _out_[_i+3] = _start_+(_i+3)*_inc_;\ }\ while(_i != _n_)\ _out_[_i] = _start_+_i*_inc_, ++_i;\ } while(0) -#define BITSIZE(_in_, _n_, _b_, _usize_) { typeof(_in_[0]) *_ip;\ - for(_b_=0,_ip = _in_; _ip != _in_+(_n_&~(4-1)); )\ - _b_ |= *_ip++ | *_ip++ | *_ip++ | *_ip++;\ +#define BITSIZE_(_in_, _n_, _b_, _usize_) { typeof(_in_[0]) *_ip;\ + for(_b_=0,_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip+=4)\ + _b_ |= _ip[0] | _ip[1] | _ip[2] | _ip[3];\ while(_ip != _in_+_n_) \ _b_ |= *_ip++;\ - _b_ = TEMPLATE(bsr, _usize_)(_b_);\ + _b_ = TEMPLATE2(bsr, _usize_)(_b_);\ } -static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; } -static inline uint64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); } +static inline uint64_t zigzagenc64(int64_t x) { return x << 1 ^ x >> 63; } +static inline uint64_t zigzagdec64(uint64_t x) { return x >> 1 ^ -(x & 1); } -static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; } -static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); } +static inline unsigned zigzagenc32(int x) { return x << 1 ^ x >> 31; } +static inline unsigned zigzagdec32(unsigned x) { return x >> 1 ^ -(x & 1); } -static inline unsigned 
zigzagenc31(int x) { x = (x << 2 | ((x>>30)& 2)) ^ x >> 31; return x; } -static inline unsigned zigzagdec31(unsigned x) { return (x >> 2 | (x& 2)<<30 ) ^ -(x & 1); } +static inline unsigned zigzagenc31(int x) { x = (x << 2 | ((x>>30)& 2)) ^ x >> 31; return x; } +static inline unsigned zigzagdec31(unsigned x) { return (x >> 2 | (x& 2)<<30 ) ^ -(x & 1); } -static inline unsigned short zigzagenc16(short x) { return x << 1 ^ x >> 31; } +static inline unsigned short zigzagenc16(short x) { return x << 1 ^ x >> 15; } static inline unsigned short zigzagdec16(unsigned short x) { return x >> 1 ^ -(x & 1); } -static inline unsigned char zigzagenc8(char x) { return x << 1 ^ x >> 31; } -static inline unsigned char zigzagdec8(unsigned short x) { return x >> 1 ^ -(x & 1); } +static inline unsigned char zigzagenc8( char x) { return x << 1 ^ x >> 7; } +static inline unsigned char zigzagdec8( unsigned short x) { return x >> 1 ^ -(x & 1); } + + #ifdef __AVX2__ +#include +#include +//#define DELTA256x32(_v_, _sv_,_iv_) ? 
+ +#define SCAN256x32( _v_, _sv_) {\ + _v_ = _mm256_add_epi32(_v_, _mm256_slli_si256(_v_, 4));\ + _v_ = _mm256_add_epi32(_v_, _mm256_slli_si256(_v_, 8));\ + _sv_ = _mm256_add_epi32( _mm256_permute2x128_si256( _mm256_shuffle_epi32(_sv_,_MM_SHUFFLE(3, 3, 3, 3)), _sv_, 0x11), \ + _mm256_add_epi32(_v_, _mm256_permute2x128_si256(zv,_mm256_shuffle_epi32(_v_, _MM_SHUFFLE(3, 3, 3, 3)), 0x20)));\ +} + +#define SCANI256x32(_v_, _sv_, _vi_) SCAN256x32(_v_, _sv_); _sv_ = _mm256_add_epi32(_sv_, _vi_) + +#define ZIGZAG256x32(_v_) _mm256_xor_si256(_mm256_slli_epi32(_v_,1), _mm256_srai_epi32(_v_,31)) +#define UNZIGZAG256x32(_v_) _mm256_xor_si256(_mm256_srli_epi32(_v_,1), _mm256_srai_epi32(_mm256_slli_epi32(_v_,31),31) ) + +#define HOR256x32(_v_,_b_) _v_ = _mm256_or_si256(_v_, _mm256_srli_si256(_v_, 8)); _v_ = _mm256_or_si256(_v_, _mm256_srli_si256(_v_, 4));\ + _b_ = _mm256_extract_epi32(_v_,0) | _mm256_extract_epi32(_v_, 4) /* OR-reduce: fold each 128-bit half into its element 0, then OR elements 0 and 4 into _b_ */ + #endif #ifdef __SSE2__ #include -// SIMD Delta -#define DELTA128_32(_v_, _sv_) _mm_sub_epi32(_v_, _mm_or_si128(_mm_srli_si128(_sv_, 12), _mm_slli_si128(_v_, 4))) +#define DELTA128x32(_v_, _sv_) _mm_sub_epi32(_v_, _mm_or_si128(_mm_srli_si128(_sv_, 12), _mm_slli_si128(_v_, 4))) // SIMD Scan ( prefix sum ) -#define SCAN128_32( _v_, _sv_) _v_ = _mm_add_epi32(_v_, _mm_slli_si128(_v_, 4)); _sv_ = _mm_add_epi32(_mm_shuffle_epi32(_sv_, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(_v_, 8), _v_) ) -#define SCANI128_32(_v_, _sv_, _vi_) SCAN128_32(_v_, _sv_); _sv_ = _mm_add_epi32(_sv_, _vi_) - -// SIMD ZigZag -#define ZIGZAG128_32(_v_) _mm_xor_si128(_mm_slli_epi32(_v_,1), _mm_srai_epi32(_v_,31)) -#define UNZIGZAG128_32(_v_) _mm_xor_si128(_mm_srli_epi32(_v_,1), _mm_srai_epi32(_mm_slli_epi32(_v_,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1)) +#define SCAN128x32( _v_, _sv_) _v_ = _mm_add_epi32(_v_, _mm_slli_si128(_v_, 4)); _sv_ = _mm_add_epi32(_mm_shuffle_epi32(_sv_, _MM_SHUFFLE(3, 3, 3, 3)), _mm_add_epi32(_mm_slli_si128(_v_, 8), _v_) ) +#define 
SCANI128x32(_v_, _sv_, _vi_) SCAN128x32(_v_, _sv_); _sv_ = _mm_add_epi32(_sv_, _vi_) +#define ZIGZAG128x32(_v_) _mm_xor_si128(_mm_slli_epi32(_v_,1), _mm_srai_epi32(_v_,31)) +#define UNZIGZAG128x32(_v_) _mm_xor_si128(_mm_srli_epi32(_v_,1), _mm_srai_epi32(_mm_slli_epi32(_v_,31),31) ) //_mm_sub_epi32(cz, _mm_and_si128(iv,c1)) // SIMD Horizontal OR -#define HOR128_32(_v_,_b_) _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 8)); _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 4)); _b_ = (unsigned)_mm_cvtsi128_si32(_v_) +#define HOR128x32(_v_,_b_) _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 8)); _v_ = _mm_or_si128(_v_, _mm_srli_si128(_v_, 4)); _b_ = (unsigned)_mm_cvtsi128_si32(_v_) + #endif + #if 0 //def __AVX2__ +#define BITSIZE32(_in_, _n_, _b_) { typeof(_in_[0]) *_ip; __m256i _v = _mm256_setzero_si256();\ + for(_ip = _in_; _ip != _in_+(_n_&~(8-1)); _ip+=8)\ + _v = _mm256_or_si256(_v, _mm256_loadu_si256((__m256i*)_ip));\ + HOR256x32(_v,_b_);\ + while(_ip != _in_+_n_)\ + _b_ |= *_ip++;\ + _b_ = bsr32(_b_);\ +} +#define BITZERO32(_out_, _n_, _start_) do {\ + __m256i _sv_ = _mm256_set1_epi32(_start_), *_ov = (__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\ + do _mm256_storeu_si256(_ov++, _sv_); while(_ov < _ove);\ +} while(0) + +#define BITFORZERO32(_out_, _n_, _start_, _inc_) do {\ + __m256i _sv = _mm256_set1_epi32(_start_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_), _cv = _mm256_set_epi32(7+_inc_,6+_inc_,5+_inc_,4+_inc_,3*_inc_,2*_inc_,1*_inc_,0); \ + _sv = _mm256_add_epi32(_sv, _cv);\ + _cv = _mm256_set1_epi32(4);\ + do { _mm256_storeu_si256(_ov++, _sv); _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\ +} while(0) + +#define BITDIZERO32(_out_, _n_, _start_, _inc_) do { __m256i _sv = _mm256_set1_epi32(_start_), _cv = _mm256_set_epi32(7+_inc_,6+_inc_,5+_inc_,4+_inc_,3+_inc_,2+_inc_,1+_inc_,_inc_), *_ov=(__m256i *)(_out_), *_ove = (__m256i *)(_out_ + _n_);\ + _sv = _mm256_add_epi32(_sv, _cv); _cv = _mm256_set1_epi32(4*_inc_); do { 
_mm256_storeu_si256(_ov++, _sv), _sv = _mm256_add_epi32(_sv, _cv); } while(_ov < _ove);\ +} while(0) + + #elif defined(__SSE2__) #define BITSIZE32(_in_, _n_, _b_) { typeof(_in_[0]) *_ip; __m128i _v = _mm_setzero_si128();\ for(_ip = _in_; _ip != _in_+(_n_&~(4-1)); _ip+=4)\ _v = _mm_or_si128(_v, _mm_loadu_si128((__m128i*)_ip));\ - HOR128_32(_v,_b_);\ + HOR128x32(_v,_b_);\ while(_ip != _in_+_n_)\ _b_ |= *_ip++;\ _b_ = bsr32(_b_);\ @@ -100,9 +145,9 @@ static inline unsigned char zigzagdec8(unsigned short x) { return x >> 1 ^ -(x & } while(0) #else -#define BITSIZE32(_in_, _n_, _b_) BITSIZE(_in_, _n_, _b_, 32) -#define BITFORZERO32(_out_, _n_, _start_, _inc_) _BITFORZERO(_out_, _n_, _start_, _inc_) -#define BITZERO32(_out_, _n_, _start_) _BITFORZERO(_out_, _n_, _start_, 0) +#define BITSIZE32( _in_, _n_, _b_) BITSIZE_(_in_, _n_, _b_, 32) +#define BITFORZERO32(_out_, _n_, _start_, _inc_) BITFORSET_(_out_, _n_, _start_, _inc_) +#define BITZERO32( _out_, _n_, _start_) BITFORSET_(_out_, _n_, _start_, 0) #endif #define DELTR( _in_, _n_, _mode_, _out_) { unsigned _v; for( _out_[0]=_in_[0],_v = 1; _v < _n_; _v++) _out_[_v] = (_in_[_v] - _out_[0]) - _v*_mode_; } @@ -161,13 +206,13 @@ void bitunzigzag64( uint64_t *p, unsigned n, unsigned start); #define DZMANT_BITS 36 -#define FLTEXPO(__u,__mantbits, __one) ( ((__u) >> __mantbits) & ( (__one<<(sizeof(__u)*8 - __mantbits)) - 1 ) ) -#define FLTMANT(__u,__mantbits, __one) ((__u) & ((__one<<__mantbits)-1)) +#define FLTEXPO(_u_,_mantbits_, _one_) ( ((_u_) >> _mantbits_) & ( (_one_<<(sizeof(_u_)*8 - _mantbits_)) - 1 ) ) +#define FLTMANT(_u_,_mantbits_, _one_) ((_u_) & ((_one_<<_mantbits_)-1)) -#define BITUNFLOAT(__expo, __mant, __u, __mantbits) __u = ((__expo) << __mantbits) | (__mant)//>>1 | (__mant)<<(sizeof(__u)*8 - 1) +#define BITUNFLOAT(_expo_, _mant_, _u_, _mantbits_) _u_ = ((_expo_) << _mantbits_) | (_mant_)//>>1 | (_mant_)<<(sizeof(_u_)*8 - 1) -/*#define BITFLOAT(__u, __sgn, __expo, __mant, __mantbits, __one) __sgn = __u >> 
(sizeof(__u)*8-1); __expo = EXPO(__u,__mantbits; __mant = __u & ((__one<<__mantbits)-1) -#define BITUNFLOAT( __sgn, __expo, __mant, __u, __mantbits) __u = (__sgn) << (sizeof(__u)*8-1) | (__expo) << __mantbits | (__mant) */ +/*#define BITFLOAT(_u_, _sgn_, _expo_, _mant_, _mantbits_, _one_) _sgn_ = _u_ >> (sizeof(_u_)*8-1); _expo_ = EXPO(_u_,_mantbits_; _mant_ = _u_ & ((_one_<<_mantbits_)-1) +#define BITUNFLOAT( _sgn_, _expo_, _mant_, _u_, _mantbits_) _u_ = (_sgn_) << (sizeof(_u_)*8-1) | (_expo_) << _mantbits_ | (_mant_) */ // De-/Compose floating point array to/from integer arrays (sign,exponent,mantissa) for using with "Integer Compression" functions ------------ void bitdouble( double *in, unsigned n, int *expo, uint64_t *mant); diff --git a/conf.h b/conf.h index 21d33dd..d48f8db 100644 --- a/conf.h +++ b/conf.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2016 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -38,6 +38,8 @@ #define popcnt64(_x_) __builtin_popcountll(_x_) #if defined(__i386__) || defined(__x86_64__) +//__bsr32 1:0,2:1,3:1,4:2,5:2,6:2,7:2,8:3,9:3,10:3,11:3,12:3,13:3,14:3,15:3,16:4,17:4,18:4,19:4,20:4,21:4,22:4,23:4,24:4,25:4,26:4,27:4,28:4,29:4,30:4,31:4,32:5 +//bsr32: 0:0,1:1,2:2,3:2,4:3,5:3,6:3,7:3,8:4,9:4,10:4,11:4,12:4,13:4,14:4,15:4,16:5,17:5,18:5,19:5,20:5,21:5,22:5,23:5,24:5,25:5,26:5,27:5,28:5,29:5,30:5,31:5,32:6, static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; } static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; } static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } @@ -47,6 +49,7 @@ static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } #else +static inline int __bsr32(unsigned x ) { return 31 - 
__builtin_clz( x); } static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; } static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } @@ -59,15 +62,15 @@ static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); #define clz64(_x_) __builtin_clzll(_x_) #define clz32(_x_) __builtin_clz(_x_) -#if __GNUC_MINOR__ < 8 -static inline unsigned short bswap16(unsigned short a) { return (a<<8)|(a>>8); } -#else +#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8 #define bswap16(x) __builtin_bswap16(x) +#else +static inline unsigned short bswap16(unsigned short x) { return __builtin_bswap32((unsigned)x << 16); } /* fallback when __builtin_bswap16 is not used: swap via the 32-bit builtin; unsigned cast avoids signed-shift overflow */ #endif #define bswap32(x) __builtin_bswap32(x) #define bswap64(x) __builtin_bswap64(x) - #elif _MSC_VER + #elif _MSC_VER //---------------------------------------------------- #define ALIGNED(x) __declspec(align(x)) #define ALWAYS_INLINE __forceinline #define NOINLINE __declspec(noinline) @@ -100,7 +103,18 @@ static inline int ctz32(unsigned x) { unsigned z = 0; _BitScanRev #endif #define ctz16(_x_) ctz32(_x_) -#define clz16(_x_) clz32(_x_) +#define clz16(_x_) (clz32(_x_)-16) + + #ifdef __AVX2__ +//#include +#include +#define bzhi64(_u_, _b_) _bzhi_u64(_u_, _b_) +#define bzhi32(_u_, _b_) _bzhi_u32(_u_, _b_) + #else +#define bzhi64(_u_, _b_) ((_u_) & ((1ull<<(_b_))-1)) +#define bzhi32(_u_, _b_) ((_u_) & ((1u <<(_b_))-1)) + #endif + //--------------- Unaligned memory access ------------------------------------- /*# || defined(i386) || defined(_X86_) || defined(__THW_INTEL)*/ #if defined(__i386__) || defined(__x86_64__) || \ @@ -110,12 +124,14 @@ static inline int ctz32(unsigned x) { unsigned z = 0; _BitScanRev defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \ defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \ defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || 
defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) -#define ctou16(_cp_) *(unsigned short *)(_cp_) -#define ctou32(_cp_) *(unsigned *)(_cp_) +#define ctou16(_cp_) (*(unsigned short *)(_cp_)) +#define ctou32(_cp_) (*(unsigned *)(_cp_)) #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) #define ctou64(_cp_) (*(unsigned long long *)(_cp_)) -#define ctou(_cp_t, _cp_) (*(_cp_t *)(_cp_)) + #elif defined(__ARM_FEATURE_UNALIGNED) +struct _PACKED longu { unsigned long long l; }; +#define ctou64(_cp_) ((struct longu *)(_cp_))->l #endif #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__) @@ -131,24 +147,24 @@ struct _PACKED longu { unsigned long long l; }; #endif #ifdef ctou16 -#define utoc16(_x_,_cp_) ctou16(_cp_) = _x_ +//#define utoc16(_x_,_cp_) ctou16(_cp_) = _x_ #else static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; } -static inline void utoc16(unsigned short x, void *cp ) { memcpy(cp, &x, sizeof(x)); } +//static inline void utoc16(unsigned short x, void *cp ) { memcpy(cp, &x, sizeof(x)); } #endif #ifdef ctou32 -#define utoc32(_x_,_cp_) ctou32(_cp_) = _x_ +//#define utoc32(_x_,_cp_) ctou32(_cp_) = _x_ #else static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; } -static inline void utoc32(unsigned x, void *cp ) { memcpy(cp, &x, sizeof(x)); } +//static inline void utoc32(unsigned x, void *cp ) { memcpy(cp, &x, sizeof(x)); } #endif #ifdef ctou64 -#define utoc64(_x_,_cp_) ctou64(_cp_) = _x_ +//#define utoc64(_x_,_cp_) ctou64(_cp_) = _x_ #else static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; } -static inline void utoc64(unsigned long long x, void *cp ) { memcpy(cp, &x, sizeof(x)); } +//static inline void utoc64(unsigned long long x, void *cp ) { memcpy(cp, &x, sizeof(x)); } #endif #define 
ctou24(_cp_) (ctou32(_cp_) & 0xffffff) diff --git a/eliasfano.c b/eliasfano.c index e13ee9c..fbd58bf 100644 --- a/eliasfano.c +++ b/eliasfano.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -35,7 +35,13 @@ #include "bitunpack.h" #include "bitutil.h" #include "eliasfano.h" - + #ifdef __SSE42__ +static inline unsigned long long blsr(unsigned long long x) { unsigned long long r; asm ("blsrq %1, %0" : "=r" (r) : "r" (x)); return r; } + #else +static inline unsigned long long blsr(unsigned long long x) { return x & (x - 1); } +//#define blsr(_x_) (_x_ & (_x_ - 1)) + #endif + #define bit_t unsigned long long #define EFE(__x,__i,__start) ((__x[__i] - __start)-(__i)*EF_INC) @@ -84,11 +90,12 @@ #undef EFANODEC //---------------------- -#define BITPACK bitpackv -#define BITUNPACK bitunpackv +#define EFSIMD +#define BITPACK bitpack128v +#define BITUNPACK bitunpack128v #define EF_INC 1 -#define EFANOENC efano1encv -#define EFANODEC efano1decv +#define EFANOENC efano1enc128v +#define EFANODEC efano1dec128v #define USIZE 32 #include __FILE__ @@ -104,12 +111,13 @@ //------------------------------------------ #define EF_INC 0 -#define EFANOENC efanoencv -#define EFANODEC efanodecv +#define EFANOENC efanoenc128v +#define EFANODEC efanodec128v #define USIZE 32 #include __FILE__ #undef USIZE +#undef EFSIMD /*#define USIZE 16 #include __FILE__ @@ -136,7 +144,12 @@ unsigned char *TEMPLATE2(EFANOENC, USIZE)(uint_t *__restrict in, unsigned n, uns pa[i] = EFE(in,i,start) & x; ++i; } while(i < n) pa[i] = EFE(in,i,start) & x, ++i; - *out = lb+1; op = TEMPLATE2(BITPACK,USIZE)(pa, n, out+1, lb); + *out = lb+1; + op = TEMPLATE2(BITPACK,USIZE)(pa, + #ifndef EFSIMD + n, + #endif + out+1, lb); memset(op, 0, hl); for(i = 0; i != n&~3; ) { @@ -163,18 +176,27 @@ unsigned char *TEMPLATE2(EFANODEC, USIZE)(unsigned char *__restrict in, unsigned BITZERO32( out, n, start); 
#endif #else - _BITFORZERO(out, n, start, EF_INC); + BITFORSET_(out, n, start, EF_INC); #endif return ip; } - ip = TEMPLATE2(BITUNPACK,USIZE)(ip, n, out, --lb); + ip = TEMPLATE2(BITUNPACK,USIZE)(ip, + #ifndef EFSIMD + n, + #endif + out, --lb); for(i=j=0;; j += sizeof(bit_t)*8) - for(b = *(bit_t *)(ip+(j>>3)); b; b &= b-1) { - out[i] = ((uint_t)(j+__builtin_ctzll(b)-i) << lb | out[i]) + start+i*EF_INC; - if(unlikely(++i >= n)) - return ip + PAD8((EFE(out,n-1,start)>>lb)+n); + for(b = *(bit_t *)(ip+(j>>3)); ; ) { + if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; b = blsr(b); i++; + if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; b = blsr(b); i++; + if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; b = blsr(b); i++; + if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; b = blsr(b); i++; + if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; b = blsr(b); i++; + if(!b) break; out[i] += ((uint_t)(j+ctz64(b)-i) << lb) + start+i*EF_INC; i++; + if(unlikely(i >= n)) goto e; b = blsr(b); } + e:return ip + PAD8((EFE(out,n-1,start)>>lb)+n); } #pragma clang diagnostic pop #endif diff --git a/eliasfano.h b/eliasfano.h index 2220492..cfb768a 100644 --- a/eliasfano.h +++ b/eliasfano.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -28,23 +28,23 @@ extern "C" { #include // compress/decompress integer array with n values to the buffer out. 
Return value = end of output/input buffer -unsigned char *efanoenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efanoenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); +unsigned char *efanoenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +unsigned char *efanoenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -unsigned char *efanodec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *efanodec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); +unsigned char *efanodec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *efanodec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -unsigned char *efano1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efano1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); +unsigned char *efano1enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +unsigned char *efano1enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -unsigned char *efano1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *efano1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); +unsigned char *efano1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *efano1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -unsigned char *efanoencv32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char 
*efanodecv32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *efanoenc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +unsigned char *efanodec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); -unsigned char *efano1encv32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); -unsigned char *efano1decv32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *efano1enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +unsigned char *efano1dec128v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); #ifdef __cplusplus } diff --git a/ext/FastPFor b/ext/FastPFor new file mode 160000 index 0000000..d259705 --- /dev/null +++ b/ext/FastPFor @@ -0,0 +1 @@ +Subproject commit d259705c9dc8e6606f6be54ff069f354b2fc1fae diff --git a/ext/LittleIntPacker b/ext/LittleIntPacker new file mode 160000 index 0000000..b1d9865 --- /dev/null +++ b/ext/LittleIntPacker @@ -0,0 +1 @@ +Subproject commit b1d98653a31ccdb466804ab3e0c37bc090f2cd76 diff --git a/ext/MaskedVByte b/ext/MaskedVByte new file mode 160000 index 0000000..e9f010c --- /dev/null +++ b/ext/MaskedVByte @@ -0,0 +1 @@ +Subproject commit e9f010cdf19aaa0dc4c2b6d23ac310e2db55e1b9 diff --git a/ext/MaskedVByte/LICENSE b/ext/MaskedVByte/LICENSE deleted file mode 100644 index e06d208..0000000 --- a/ext/MaskedVByte/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ -Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. 
- - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. 
- - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - diff --git a/ext/MaskedVByte/include/varintdecode.h b/ext/MaskedVByte/include/varintdecode.h deleted file mode 100644 index fbdb049..0000000 --- a/ext/MaskedVByte/include/varintdecode.h +++ /dev/null @@ -1,28 +0,0 @@ - -#ifndef VARINTDECODE_H_ -#define VARINTDECODE_H_ -#define __STDC_FORMAT_MACROS -#include -#include // please use a C99-compatible compiler -#include - -// This function must be called once to initialized tables before using the other functions below -void simdvbyteinit(void); - -// Read "length" 32-bit integers in varint format from in, storing the result in out. Returns the number of bytes read. 
-size_t masked_vbyte_decode(const uint8_t* in, uint32_t* out, uint64_t length); - -// Read "length" 32-bit integers in varint format from in, storing the result in out with differential coding starting at prev. Setting prev to zero is a good default. Returns the number of bytes read. -size_t masked_vbyte_decode_delta(const uint8_t* in, uint32_t* out, uint64_t length, uint32_t prev); - -// Read 32-bit integers in varint format from in, reading inputsize bytes, storing the result in out. Returns the number of integers read. -size_t masked_vbyte_decode_fromcompressedsize(const uint8_t* in, uint32_t* out, - size_t inputsize); - -// Read 32-bit integers in varint format from in, reading inputsize bytes, storing the result in out with differential coding starting at prev. Setting prev to zero is a good default. Returns the number of integers read. -size_t masked_vbyte_decode_fromcompressedsize_delta(const uint8_t* in, uint32_t* out, - size_t inputsize, uint32_t prev); - - - -#endif /* VARINTDECODE_H_ */ diff --git a/ext/MaskedVByte/include/varintencode.h b/ext/MaskedVByte/include/varintencode.h deleted file mode 100644 index 2183968..0000000 --- a/ext/MaskedVByte/include/varintencode.h +++ /dev/null @@ -1,18 +0,0 @@ - -#ifndef VARINTENCODE_H_ -#define VARINTENCODE_H_ - -#include // please use a C99-compatible compiler -#include - -// Encode an array of a given length read from in to bout in varint format. -// Returns the number of bytes written. -size_t vbyte_encode(uint32_t *in, size_t length, uint8_t *bout); - -// Encode an array of a given length read from in to bout in varint format with differential -// coding starting at value prev. (Setting prev to 0 is a good default.) -// Returns the number of bytes written. 
-size_t vbyte_encode_delta(uint32_t *in, size_t length, uint8_t *bout, uint32_t prev); - - -#endif /* VARINTENCODE_H_ */ diff --git a/ext/MaskedVByte/src/varintdecode.c b/ext/MaskedVByte/src/varintdecode.c deleted file mode 100644 index 6f43732..0000000 --- a/ext/MaskedVByte/src/varintdecode.c +++ /dev/null @@ -1,1760 +0,0 @@ -#include "../include/varintdecode.h" - -#include - -static const uint8_t vec_lookup[] __attribute__((aligned(0x1000))) = { 0, 32, - 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, 36, 20, - 121, 12, 56, 85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, - 10, 52, 83, 160, 6, 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, - 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 50, - 82, 160, 5, 42, 26, 128, 70, 110, 148, 165, 3, 38, 22, 122, 14, 60, 86, - 161, 66, 98, 80, 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, - 160, 65, 96, 78, 133, 72, 115, 148, 167, 64, 150, 146, 155, 145, 151, - 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 49, 82, - 160, 4, 41, 25, 127, 70, 109, 148, 165, 2, 37, 21, 121, 13, 58, 85, 161, - 66, 97, 79, 137, 145, 153, 149, 0, 1, 35, 19, 119, 11, 54, 83, 160, 7, - 46, 30, 131, 71, 113, 148, 166, 64, 93, 75, 125, 69, 107, 89, 162, 145, - 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, 160, 65, 94, - 76, 129, 70, 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, 161, 66, 99, - 81, 142, 145, 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, 160, 65, 150, - 146, 156, 145, 152, 148, 168, 64, 150, 146, 155, 145, 151, 147, 164, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, - 109, 148, 165, 2, 36, 20, 121, 12, 57, 85, 161, 66, 97, 79, 136, 145, - 153, 149, 0, 1, 34, 18, 119, 10, 53, 83, 160, 6, 45, 29, 130, 71, 112, - 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, 150, 146, 158, 145, - 154, 0, 0, 0, 33, 17, 118, 9, 51, 82, 160, 5, 43, 27, 128, 70, 110, 148, - 165, 3, 39, 23, 122, 15, 62, 86, 161, 66, 98, 80, 140, 145, 153, 
149, 0, - 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, 134, 72, 116, 148, 167, - 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, - 0, 32, 16, 118, 8, 100, 82, 160, 4, 94, 76, 127, 70, 109, 148, 165, 2, - 92, 74, 121, 68, 103, 85, 161, 66, 97, 79, 138, 145, 153, 149, 0, 1, 91, - 73, 119, 67, 101, 83, 160, 65, 95, 77, 132, 71, 114, 148, 166, 64, 93, - 75, 126, 69, 108, 90, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, - 73, 118, 67, 100, 82, 160, 65, 94, 76, 156, 70, 152, 148, 165, 64, 92, - 74, 155, 68, 151, 147, 161, 66, 150, 146, 157, 145, 153, 149, 0, 64, 91, - 73, 155, 67, 151, 147, 160, 65, 150, 146, 156, 145, 152, 148, 169, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, - 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, 36, 20, 121, 12, 56, 85, 161, - 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, 10, 52, 83, 160, 6, - 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, - 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 50, 82, 160, 5, 42, - 26, 128, 70, 110, 148, 165, 3, 38, 22, 122, 14, 61, 86, 161, 66, 98, 80, - 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, - 133, 72, 115, 148, 167, 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, - 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 49, 82, 160, 4, 41, 25, 127, - 70, 109, 148, 165, 2, 37, 21, 121, 13, 59, 85, 161, 66, 97, 79, 137, - 145, 153, 149, 0, 1, 35, 19, 119, 11, 55, 83, 160, 7, 47, 31, 131, 71, - 113, 148, 166, 64, 93, 75, 125, 69, 107, 89, 162, 145, 150, 146, 158, - 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, 160, 65, 94, 76, 129, 70, - 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, 161, 66, 99, 81, 143, 145, - 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, 160, 65, 150, 146, 156, 145, - 152, 148, 168, 64, 150, 146, 155, 145, 151, 147, 164, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, - 2, 36, 20, 121, 12, 103, 85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, - 34, 18, 
119, 10, 101, 83, 160, 6, 95, 77, 130, 71, 112, 148, 166, 64, - 93, 75, 124, 69, 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, - 33, 17, 118, 9, 100, 82, 160, 5, 94, 76, 128, 70, 110, 148, 165, 3, 92, - 74, 122, 68, 104, 86, 161, 66, 98, 80, 141, 145, 153, 149, 0, 64, 91, - 73, 120, 67, 102, 84, 160, 65, 96, 78, 135, 72, 117, 148, 167, 64, 150, - 146, 155, 145, 151, 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, - 118, 8, 100, 82, 160, 4, 94, 76, 127, 70, 109, 148, 165, 2, 92, 74, 121, - 68, 103, 85, 161, 66, 97, 79, 157, 145, 153, 149, 0, 1, 91, 73, 119, 67, - 101, 83, 160, 65, 95, 77, 156, 71, 152, 148, 166, 64, 93, 75, 155, 69, - 151, 147, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, - 100, 82, 160, 65, 94, 76, 156, 70, 152, 148, 165, 64, 92, 74, 155, 68, - 151, 147, 161, 66, 150, 146, 157, 145, 153, 149, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, - 36, 20, 121, 12, 56, 85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, - 18, 119, 10, 52, 83, 160, 6, 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, - 124, 69, 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, - 118, 9, 50, 82, 160, 5, 42, 26, 128, 70, 110, 148, 165, 3, 38, 22, 122, - 14, 60, 86, 161, 66, 98, 80, 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, - 102, 84, 160, 65, 96, 78, 133, 72, 115, 148, 167, 64, 150, 146, 155, - 145, 151, 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, - 49, 82, 160, 4, 41, 25, 127, 70, 109, 148, 165, 2, 37, 21, 121, 13, 58, - 85, 161, 66, 97, 79, 137, 145, 153, 149, 0, 1, 35, 19, 119, 11, 54, 83, - 160, 7, 46, 30, 131, 71, 113, 148, 166, 64, 93, 75, 125, 69, 107, 89, - 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, - 160, 65, 94, 76, 129, 70, 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, - 161, 66, 99, 81, 142, 145, 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, - 160, 65, 150, 146, 156, 
145, 152, 148, 168, 64, 150, 146, 155, 145, 151, - 147, 164, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, - 24, 127, 70, 109, 148, 165, 2, 36, 20, 121, 12, 57, 85, 161, 66, 97, 79, - 136, 145, 153, 149, 0, 1, 34, 18, 119, 10, 53, 83, 160, 6, 45, 29, 130, - 71, 112, 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, 150, 146, - 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 51, 82, 160, 5, 43, 27, 128, 70, - 110, 148, 165, 3, 39, 23, 122, 15, 63, 86, 161, 66, 98, 80, 140, 145, - 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, 134, 72, - 116, 148, 167, 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, 146, - 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 100, 82, 160, 4, 94, 76, 127, 70, - 109, 148, 165, 2, 92, 74, 121, 68, 103, 85, 161, 66, 97, 79, 138, 145, - 153, 149, 0, 1, 91, 73, 119, 67, 101, 83, 160, 65, 95, 77, 132, 71, 114, - 148, 166, 64, 93, 75, 126, 69, 108, 90, 162, 145, 150, 146, 158, 145, - 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, 160, 65, 94, 76, 156, 70, 152, - 148, 165, 64, 92, 74, 155, 68, 151, 147, 161, 66, 150, 146, 157, 145, - 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, 160, 65, 150, 146, 156, 145, - 152, 148, 169, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, - 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, 36, 20, - 121, 12, 56, 85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, - 10, 52, 83, 160, 6, 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, - 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 50, - 82, 160, 5, 42, 26, 128, 70, 110, 148, 165, 3, 38, 22, 122, 14, 104, 86, - 161, 66, 98, 80, 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, - 160, 65, 96, 78, 133, 72, 115, 148, 167, 64, 150, 146, 155, 145, 151, - 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 49, 82, - 160, 4, 41, 25, 127, 70, 109, 148, 165, 2, 37, 21, 121, 13, 103, 85, - 161, 66, 97, 79, 137, 145, 153, 149, 0, 1, 35, 19, 119, 11, 101, 83, - 160, 7, 95, 77, 131, 71, 113, 148, 166, 64, 93, 
75, 125, 69, 107, 89, - 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, - 160, 65, 94, 76, 129, 70, 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, - 161, 66, 99, 81, 144, 145, 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, - 160, 65, 150, 146, 156, 145, 152, 148, 168, 64, 150, 146, 155, 145, 151, - 147, 164, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, - 24, 127, 70, 109, 148, 165, 2, 36, 20, 121, 12, 103, 85, 161, 66, 97, - 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, 10, 101, 83, 160, 6, 95, 77, - 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, 150, - 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 100, 82, 160, 5, 94, 76, - 128, 70, 110, 148, 165, 3, 92, 74, 122, 68, 104, 86, 161, 66, 98, 80, - 157, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, - 156, 72, 152, 148, 167, 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, - 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 100, 82, 160, 4, 94, 76, 127, - 70, 109, 148, 165, 2, 92, 74, 121, 68, 103, 85, 161, 66, 97, 79, 157, - 145, 153, 149, 0, 1, 91, 73, 119, 67, 101, 83, 160, 65, 95, 77, 156, 71, - 152, 148, 166, 64, 93, 75, 155, 69, 151, 147, 162, 145, 150, 146, 158, - 145, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, - 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, 36, 20, - 121, 12, 56, 85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, - 10, 52, 83, 160, 6, 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, - 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 50, - 82, 160, 5, 42, 26, 128, 70, 110, 148, 165, 3, 38, 22, 122, 14, 60, 86, - 161, 66, 98, 80, 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, - 160, 65, 96, 78, 133, 72, 115, 148, 167, 64, 150, 146, 155, 145, 151, - 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 49, 82, 
- 160, 4, 41, 25, 127, 70, 109, 148, 165, 2, 37, 21, 121, 13, 58, 85, 161, - 66, 97, 79, 137, 145, 153, 149, 0, 1, 35, 19, 119, 11, 54, 83, 160, 7, - 46, 30, 131, 71, 113, 148, 166, 64, 93, 75, 125, 69, 107, 89, 162, 145, - 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, 160, 65, 94, - 76, 129, 70, 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, 161, 66, 99, - 81, 142, 145, 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, 160, 65, 150, - 146, 156, 145, 152, 148, 168, 64, 150, 146, 155, 145, 151, 147, 164, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, - 109, 148, 165, 2, 36, 20, 121, 12, 57, 85, 161, 66, 97, 79, 136, 145, - 153, 149, 0, 1, 34, 18, 119, 10, 53, 83, 160, 6, 45, 29, 130, 71, 112, - 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, 150, 146, 158, 145, - 154, 0, 0, 0, 33, 17, 118, 9, 51, 82, 160, 5, 43, 27, 128, 70, 110, 148, - 165, 3, 39, 23, 122, 15, 62, 86, 161, 66, 98, 80, 140, 145, 153, 149, 0, - 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, 134, 72, 116, 148, 167, - 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, - 0, 32, 16, 118, 8, 100, 82, 160, 4, 94, 76, 127, 70, 109, 148, 165, 2, - 92, 74, 121, 68, 103, 85, 161, 66, 97, 79, 138, 145, 153, 149, 0, 1, 91, - 73, 119, 67, 101, 83, 160, 65, 95, 77, 132, 71, 114, 148, 166, 64, 93, - 75, 126, 69, 108, 90, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, - 73, 118, 67, 100, 82, 160, 65, 94, 76, 156, 70, 152, 148, 165, 64, 92, - 74, 155, 68, 151, 147, 161, 66, 150, 146, 157, 145, 153, 149, 0, 64, 91, - 73, 155, 67, 151, 147, 160, 65, 150, 146, 156, 145, 152, 148, 169, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, - 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, 36, 20, 121, 12, 56, 85, 161, - 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, 10, 52, 83, 160, 6, - 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, - 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 50, 82, 160, 5, 42, - 26, 128, 
70, 110, 148, 165, 3, 38, 22, 122, 14, 61, 86, 161, 66, 98, 80, - 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, - 133, 72, 115, 148, 167, 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, - 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 49, 82, 160, 4, 41, 25, 127, - 70, 109, 148, 165, 2, 37, 21, 121, 13, 59, 85, 161, 66, 97, 79, 137, - 145, 153, 149, 0, 1, 35, 19, 119, 11, 55, 83, 160, 7, 47, 31, 131, 71, - 113, 148, 166, 64, 93, 75, 125, 69, 107, 89, 162, 145, 150, 146, 158, - 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, 160, 65, 94, 76, 129, 70, - 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, 161, 66, 99, 81, 143, 145, - 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, 160, 65, 150, 146, 156, 145, - 152, 148, 168, 64, 150, 146, 155, 145, 151, 147, 164, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, - 2, 36, 20, 121, 12, 103, 85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, - 34, 18, 119, 10, 101, 83, 160, 6, 95, 77, 130, 71, 112, 148, 166, 64, - 93, 75, 124, 69, 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, - 33, 17, 118, 9, 100, 82, 160, 5, 94, 76, 128, 70, 110, 148, 165, 3, 92, - 74, 122, 68, 104, 86, 161, 66, 98, 80, 141, 145, 153, 149, 0, 64, 91, - 73, 120, 67, 102, 84, 160, 65, 96, 78, 135, 72, 117, 148, 167, 64, 150, - 146, 155, 145, 151, 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, - 118, 8, 100, 82, 160, 4, 94, 76, 127, 70, 109, 148, 165, 2, 92, 74, 121, - 68, 103, 85, 161, 66, 97, 79, 157, 145, 153, 149, 0, 1, 91, 73, 119, 67, - 101, 83, 160, 65, 95, 77, 156, 71, 152, 148, 166, 64, 93, 75, 155, 69, - 151, 147, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, - 100, 82, 160, 65, 94, 76, 156, 70, 152, 148, 165, 64, 92, 74, 155, 68, - 151, 147, 161, 66, 150, 146, 157, 145, 153, 149, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, - 36, 20, 121, 12, 56, 
85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, - 18, 119, 10, 52, 83, 160, 6, 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, - 124, 69, 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, - 118, 9, 50, 82, 160, 5, 42, 26, 128, 70, 110, 148, 165, 3, 38, 22, 122, - 14, 60, 86, 161, 66, 98, 80, 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, - 102, 84, 160, 65, 96, 78, 133, 72, 115, 148, 167, 64, 150, 146, 155, - 145, 151, 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, - 49, 82, 160, 4, 41, 25, 127, 70, 109, 148, 165, 2, 37, 21, 121, 13, 58, - 85, 161, 66, 97, 79, 137, 145, 153, 149, 0, 1, 35, 19, 119, 11, 54, 83, - 160, 7, 46, 30, 131, 71, 113, 148, 166, 64, 93, 75, 125, 69, 107, 89, - 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, - 160, 65, 94, 76, 129, 70, 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, - 161, 66, 99, 81, 142, 145, 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, - 160, 65, 150, 146, 156, 145, 152, 148, 168, 64, 150, 146, 155, 145, 151, - 147, 164, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, - 24, 127, 70, 109, 148, 165, 2, 36, 20, 121, 12, 57, 85, 161, 66, 97, 79, - 136, 145, 153, 149, 0, 1, 34, 18, 119, 10, 53, 83, 160, 6, 45, 29, 130, - 71, 112, 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, 150, 146, - 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 51, 82, 160, 5, 43, 27, 128, 70, - 110, 148, 165, 3, 39, 23, 122, 15, 104, 86, 161, 66, 98, 80, 140, 145, - 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, 134, 72, - 116, 148, 167, 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, 146, - 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 100, 82, 160, 4, 94, 76, 127, 70, - 109, 148, 165, 2, 92, 74, 121, 68, 103, 85, 161, 66, 97, 79, 138, 145, - 153, 149, 0, 1, 91, 73, 119, 67, 101, 83, 160, 65, 95, 77, 132, 71, 114, - 148, 166, 64, 93, 75, 126, 69, 108, 90, 162, 145, 150, 146, 158, 145, - 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, 160, 65, 94, 76, 156, 70, 152, - 148, 165, 64, 92, 74, 155, 68, 151, 147, 
161, 66, 150, 146, 157, 145, - 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, 160, 65, 150, 146, 156, 145, - 152, 148, 169, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, - 16, 118, 8, 48, 82, 160, 4, 40, 24, 127, 70, 109, 148, 165, 2, 36, 20, - 121, 12, 56, 85, 161, 66, 97, 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, - 10, 52, 83, 160, 6, 44, 28, 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, - 106, 88, 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 50, - 82, 160, 5, 42, 26, 128, 70, 110, 148, 165, 3, 38, 22, 122, 14, 104, 86, - 161, 66, 98, 80, 139, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, - 160, 65, 96, 78, 133, 72, 115, 148, 167, 64, 150, 146, 155, 145, 151, - 147, 163, 145, 150, 146, 159, 0, 0, 0, 0, 0, 32, 16, 118, 8, 49, 82, - 160, 4, 41, 25, 127, 70, 109, 148, 165, 2, 37, 21, 121, 13, 103, 85, - 161, 66, 97, 79, 137, 145, 153, 149, 0, 1, 35, 19, 119, 11, 101, 83, - 160, 7, 95, 77, 131, 71, 113, 148, 166, 64, 93, 75, 125, 69, 107, 89, - 162, 145, 150, 146, 158, 145, 154, 0, 0, 0, 91, 73, 118, 67, 100, 82, - 160, 65, 94, 76, 129, 70, 111, 148, 165, 64, 92, 74, 123, 68, 105, 87, - 161, 66, 99, 81, 157, 145, 153, 149, 0, 64, 91, 73, 155, 67, 151, 147, - 160, 65, 150, 146, 156, 145, 152, 148, 168, 64, 150, 146, 155, 145, 151, - 147, 164, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 16, 118, 8, 48, 82, 160, 4, 40, - 24, 127, 70, 109, 148, 165, 2, 36, 20, 121, 12, 103, 85, 161, 66, 97, - 79, 136, 145, 153, 149, 0, 1, 34, 18, 119, 10, 101, 83, 160, 6, 95, 77, - 130, 71, 112, 148, 166, 64, 93, 75, 124, 69, 106, 88, 162, 145, 150, - 146, 158, 145, 154, 0, 0, 0, 33, 17, 118, 9, 100, 82, 160, 5, 94, 76, - 128, 70, 110, 148, 165, 3, 92, 74, 122, 68, 104, 86, 161, 66, 98, 80, - 157, 145, 153, 149, 0, 64, 91, 73, 120, 67, 102, 84, 160, 65, 96, 78, - 156, 72, 152, 148, 167, 64, 150, 146, 155, 145, 151, 147, 163, 145, 150, - 146, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; - -static int read_int(const uint8_t* in, uint32_t* out) { - *out = in[0] & 0x7F; - if (in[0] < 128) { - return 1; - } - *out = ((in[1] & 0x7FU) << 7) | *out; - if (in[1] < 128) { - return 2; - } - *out = ((in[2] & 0x7FU) << 14) | *out; - if (in[2] < 128) { - return 3; - } - *out = ((in[3] & 0x7FU) << 21) | *out; - if (in[3] < 128) { - return 4; - } - *out = ((in[4] & 0x7FU) << 28) | *out; - return 5; -} - -static int read_int_delta(const uint8_t* in, uint32_t* out, uint32_t* prev) { - *out = in[0] & 0x7F; - if (in[0] < 128) { - *prev += *out; - *out = *prev; - return 1; - } - *out = ((in[1] & 0x7FU) << 7) | *out; - if (in[1] < 128) { - *prev += *out; - *out = *prev; - return 2; - } - *out = ((in[2] & 0x7FU) << 14) | *out; - if (in[2] < 128) { - *prev += *out; - *out = *prev; - return 3; - } - *out = ((in[3] & 0x7FU) << 21) | *out; - if (in[3] < 128) { - *prev += *out; - *out = *prev; - return 4; - } - *out = ((in[4] & 0x7FU) << 28) | *out; - *prev += *out; - *out = *prev; - return 5; -} - -static const uint8_t bytes_consumed[] = { 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, - 7, 5, 6, 7, 8, 8, 7, 8, 9, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, - 9, 7, 5, 8, 9, 9, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, - 7, 0, 0, 6, 8, 8, 6, 8, 9, 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, 9, 9, 8, 9, - 10, 8, 6, 6, 8, 8, 9, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 9, 8, - 9, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, - 9, 6, 5, 7, 9, 9, 7, 6, 7, 5, 6, 7, 9, 9, 7, 9, 10, 7, 6, 6, 7, 7, 9, 2, - 6, 6, 0, 7, 9, 9, 7, 9, 10, 7, 5, 9, 10, 10, 9, 7, 9, 5, 7, 4, 7, 7, 9, - 7, 9, 9, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 9, - 6, 9, 5, 6, 4, 6, 6, 9, 6, 9, 9, 6, 6, 9, 
9, 10, 2, 6, 6, 0, 4, 5, 5, 4, - 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, 2, 3, 3, 0, - 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, 7, - 8, 10, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 10, 7, 5, 8, 10, 10, - 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, - 6, 8, 10, 6, 5, 8, 10, 10, 8, 6, 8, 5, 6, 8, 10, 10, 8, 10, 11, 8, 6, 6, - 8, 8, 10, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 10, 8, 10, 5, 8, - 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 6, 6, 5, - 7, 6, 6, 7, 6, 7, 5, 6, 7, 6, 6, 7, 6, 7, 7, 6, 6, 7, 7, 10, 2, 6, 6, 0, - 7, 5, 5, 7, 5, 7, 7, 5, 5, 7, 7, 10, 7, 10, 5, 7, 4, 7, 7, 10, 7, 10, - 10, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 5, 6, 5, - 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, 6, 3, 3, 6, 2, 6, 6, 0, 4, 5, 5, 4, 5, 4, - 4, 5, 5, 3, 3, 5, 2, 5, 5, 10, 4, 3, 3, 4, 2, 4, 4, 0, 2, 3, 3, 0, 2, 0, - 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 9, - 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 9, 7, 5, 8, 9, 9, 8, 7, 8, - 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 9, - 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, 9, 9, 8, 9, 11, 8, 6, 6, 8, 8, 9, 2, 6, - 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 9, 8, 9, 5, 8, 4, 3, 3, 4, 2, 4, - 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 9, 6, 5, 7, 9, 9, 7, 6, 7, - 5, 6, 7, 9, 9, 7, 9, 11, 7, 6, 6, 7, 7, 9, 2, 6, 6, 0, 7, 9, 9, 7, 9, - 11, 7, 5, 9, 11, 11, 9, 7, 9, 5, 7, 4, 7, 7, 9, 7, 9, 9, 7, 2, 3, 3, 7, - 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 9, 6, 9, 5, 6, 4, 6, 6, 9, - 6, 9, 9, 6, 6, 9, 9, 11, 2, 6, 6, 0, 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, - 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, - 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 7, 7, 6, 6, 7, 7, 8, - 2, 6, 6, 0, 7, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 8, 7, 8, 5, 7, 4, 7, 7, 8, - 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 8, - 6, 8, 5, 6, 8, 6, 6, 8, 6, 8, 8, 6, 6, 
8, 8, 11, 2, 6, 6, 0, 4, 5, 5, 8, - 5, 8, 8, 5, 5, 8, 8, 11, 8, 11, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, - 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 7, 6, 7, 5, 6, 7, 6, 6, - 7, 6, 7, 7, 6, 6, 7, 7, 6, 2, 6, 6, 0, 7, 5, 5, 7, 5, 7, 7, 5, 5, 7, 7, - 5, 7, 5, 5, 7, 4, 7, 7, 4, 7, 4, 4, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, - 6, 5, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, 6, 3, 3, - 6, 2, 6, 6, 0, 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 0, 4, 3, 3, - 4, 2, 4, 4, 0, 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, - 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 9, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, - 7, 8, 9, 7, 5, 8, 9, 9, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, - 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 9, 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, 9, 9, - 8, 9, 10, 8, 6, 6, 8, 8, 9, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, - 9, 8, 9, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, - 6, 7, 9, 6, 5, 7, 9, 9, 7, 6, 7, 5, 6, 7, 9, 9, 7, 9, 10, 7, 6, 6, 7, 7, - 9, 2, 6, 6, 0, 7, 9, 9, 7, 9, 10, 7, 5, 9, 10, 10, 9, 7, 9, 5, 7, 4, 7, - 7, 9, 7, 9, 9, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, - 6, 9, 6, 9, 5, 6, 4, 6, 6, 9, 6, 9, 9, 6, 6, 9, 9, 10, 2, 6, 6, 0, 4, 5, - 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, 2, 3, - 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, - 8, 7, 8, 10, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 10, 7, 5, 8, - 10, 10, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, - 6, 8, 8, 6, 8, 10, 6, 5, 8, 10, 10, 8, 6, 8, 5, 6, 8, 10, 10, 8, 10, 12, - 8, 6, 6, 8, 8, 10, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 10, 8, - 10, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, - 6, 6, 5, 7, 6, 6, 7, 6, 7, 5, 6, 7, 6, 6, 7, 6, 7, 7, 6, 6, 7, 7, 10, 2, - 6, 6, 0, 7, 5, 5, 7, 5, 7, 7, 5, 5, 7, 7, 10, 7, 10, 5, 7, 4, 7, 7, 10, - 7, 10, 10, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, - 5, 6, 5, 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, 
6, 3, 3, 6, 2, 6, 6, 0, 4, 5, 5, - 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 10, 4, 3, 3, 4, 2, 4, 4, 0, 2, 3, 3, - 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, - 7, 8, 9, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 9, 7, 5, 8, 9, 9, - 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, - 6, 8, 9, 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, 9, 9, 8, 9, 8, 8, 6, 6, 8, 8, - 9, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 9, 8, 9, 5, 8, 4, 3, 3, - 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 9, 6, 5, 7, 9, 9, - 7, 6, 7, 5, 6, 7, 9, 9, 7, 9, 7, 7, 6, 6, 7, 7, 9, 2, 6, 6, 0, 7, 9, 9, - 7, 9, 7, 7, 5, 9, 7, 7, 9, 7, 9, 5, 7, 4, 7, 7, 9, 7, 9, 9, 7, 2, 3, 3, - 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 9, 6, 9, 5, 6, 4, 6, 6, - 9, 6, 9, 9, 6, 6, 9, 9, 12, 2, 6, 6, 0, 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, 3, - 5, 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, 7, - 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 7, 7, 6, 6, 7, 7, - 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 8, 7, 8, 5, 7, 4, 7, 7, - 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, - 8, 6, 8, 5, 6, 8, 6, 6, 8, 6, 8, 8, 6, 6, 8, 8, 6, 2, 6, 6, 0, 4, 5, 5, - 8, 5, 8, 8, 5, 5, 8, 8, 5, 8, 5, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, - 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 7, 6, 7, 5, 6, 7, 6, 6, - 7, 6, 7, 7, 6, 6, 7, 7, 6, 2, 6, 6, 0, 7, 5, 5, 7, 5, 7, 7, 5, 5, 7, 7, - 5, 7, 5, 5, 7, 4, 7, 7, 4, 7, 4, 4, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, - 6, 5, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, 6, 3, 3, - 6, 2, 6, 6, 0, 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 0, 4, 3, 3, - 4, 2, 4, 4, 0, 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, - 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 9, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, - 7, 8, 9, 7, 5, 8, 9, 9, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, - 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 9, 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, 9, 9, - 8, 9, 10, 8, 6, 6, 8, 8, 9, 2, 6, 6, 
0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, - 9, 8, 9, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, - 6, 7, 9, 6, 5, 7, 9, 9, 7, 6, 7, 5, 6, 7, 9, 9, 7, 9, 10, 7, 6, 6, 7, 7, - 9, 2, 6, 6, 0, 7, 9, 9, 7, 9, 10, 7, 5, 9, 10, 10, 9, 7, 9, 5, 7, 4, 7, - 7, 9, 7, 9, 9, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, - 6, 9, 6, 9, 5, 6, 4, 6, 6, 9, 6, 9, 9, 6, 6, 9, 9, 10, 2, 6, 6, 0, 4, 5, - 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, 2, 3, - 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, - 8, 7, 8, 10, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 10, 7, 5, 8, - 10, 10, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, - 6, 8, 8, 6, 8, 10, 6, 5, 8, 10, 10, 8, 6, 8, 5, 6, 8, 10, 10, 8, 10, 11, - 8, 6, 6, 8, 8, 10, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 10, 8, - 10, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, - 6, 6, 5, 7, 6, 6, 7, 6, 7, 5, 6, 7, 6, 6, 7, 6, 7, 7, 6, 6, 7, 7, 10, 2, - 6, 6, 0, 7, 5, 5, 7, 5, 7, 7, 5, 5, 7, 7, 10, 7, 10, 5, 7, 4, 7, 7, 10, - 7, 10, 10, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, - 5, 6, 5, 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, 6, 3, 3, 6, 2, 6, 6, 0, 4, 5, 5, - 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 10, 4, 3, 3, 4, 2, 4, 4, 0, 2, 3, 3, - 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, - 7, 8, 9, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 9, 7, 5, 8, 9, 9, - 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, - 6, 8, 9, 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, 9, 9, 8, 9, 11, 8, 6, 6, 8, 8, - 9, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 9, 8, 9, 5, 8, 4, 3, 3, - 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 9, 6, 5, 7, 9, 9, - 7, 6, 7, 5, 6, 7, 9, 9, 7, 9, 11, 7, 6, 6, 7, 7, 9, 2, 6, 6, 0, 7, 9, 9, - 7, 9, 11, 7, 5, 9, 11, 11, 9, 7, 9, 5, 7, 4, 7, 7, 9, 7, 9, 9, 7, 2, 3, - 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 9, 6, 9, 5, 6, 4, 6, - 6, 9, 6, 9, 9, 6, 6, 9, 9, 11, 2, 
6, 6, 0, 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, - 3, 5, 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, - 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 7, 7, 6, 6, 7, - 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 8, 7, 8, 5, 7, 4, 7, - 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 6, 6, 5, 8, 6, - 6, 8, 6, 8, 5, 6, 8, 6, 6, 8, 6, 8, 8, 6, 6, 8, 8, 11, 2, 6, 6, 0, 4, 5, - 5, 8, 5, 8, 8, 5, 5, 8, 8, 11, 8, 11, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, - 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 7, 6, 7, 5, 6, 7, - 6, 6, 7, 6, 7, 7, 6, 6, 7, 7, 6, 2, 6, 6, 0, 7, 5, 5, 7, 5, 7, 7, 5, 5, - 7, 7, 5, 7, 5, 5, 7, 4, 7, 7, 4, 7, 4, 4, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, - 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, 6, - 3, 3, 6, 2, 6, 6, 0, 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 0, 4, - 3, 3, 4, 2, 4, 4, 0, 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, - 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 9, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, - 8, 8, 7, 8, 9, 7, 5, 8, 9, 9, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, - 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 9, 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, - 9, 9, 8, 9, 10, 8, 6, 6, 8, 8, 9, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, - 8, 8, 9, 8, 9, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, - 7, 7, 6, 7, 9, 6, 5, 7, 9, 9, 7, 6, 7, 5, 6, 7, 9, 9, 7, 9, 10, 7, 6, 6, - 7, 7, 9, 2, 6, 6, 0, 7, 9, 9, 7, 9, 10, 7, 5, 9, 10, 10, 9, 7, 9, 5, 7, - 4, 7, 7, 9, 7, 9, 9, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, - 5, 6, 6, 9, 6, 9, 5, 6, 4, 6, 6, 9, 6, 9, 9, 6, 6, 9, 9, 10, 2, 6, 6, 0, - 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, - 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, - 7, 8, 8, 7, 8, 10, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 10, 7, - 5, 8, 10, 10, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, - 0, 0, 6, 8, 8, 6, 8, 10, 6, 5, 8, 10, 10, 8, 6, 8, 5, 6, 8, 10, 10, 8, - 10, 8, 8, 6, 6, 8, 8, 10, 2, 6, 6, 
0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, - 10, 8, 10, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, - 7, 6, 7, 6, 6, 5, 7, 6, 6, 7, 6, 7, 5, 6, 7, 6, 6, 7, 6, 7, 7, 6, 6, 7, - 7, 10, 2, 6, 6, 0, 7, 5, 5, 7, 5, 7, 7, 5, 5, 7, 7, 10, 7, 10, 5, 7, 4, - 7, 7, 10, 7, 10, 10, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, - 5, 6, 6, 5, 6, 5, 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, 6, 3, 3, 6, 2, 6, 6, 0, - 4, 5, 5, 4, 5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 10, 4, 3, 3, 4, 2, 4, 4, 0, - 2, 3, 3, 0, 2, 0, 0, 0, 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, - 7, 8, 8, 7, 8, 9, 7, 6, 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 9, 7, 5, - 8, 9, 9, 8, 7, 8, 5, 7, 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, - 6, 8, 8, 6, 8, 9, 6, 5, 8, 9, 9, 8, 6, 8, 5, 6, 8, 9, 9, 8, 9, 8, 8, 6, - 6, 8, 8, 9, 2, 6, 6, 0, 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 9, 8, 9, 5, 8, - 4, 3, 3, 4, 2, 4, 4, 8, 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 9, 6, 5, - 7, 9, 9, 7, 6, 7, 5, 6, 7, 9, 9, 7, 9, 7, 7, 6, 6, 7, 7, 9, 2, 6, 6, 0, - 7, 9, 9, 7, 9, 7, 7, 5, 9, 7, 7, 9, 7, 9, 5, 7, 4, 7, 7, 9, 7, 9, 9, 7, - 2, 3, 3, 7, 2, 7, 0, 0, 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 9, 6, 9, 5, 6, - 4, 6, 6, 9, 6, 9, 9, 6, 6, 9, 9, 6, 2, 6, 6, 0, 4, 5, 5, 4, 5, 4, 4, 5, - 5, 3, 3, 5, 2, 5, 5, 9, 4, 3, 3, 4, 2, 4, 4, 9, 2, 3, 3, 0, 2, 0, 0, 0, - 6, 7, 7, 6, 7, 8, 6, 5, 7, 8, 8, 7, 6, 7, 5, 6, 7, 8, 8, 7, 8, 7, 7, 6, - 6, 7, 7, 8, 2, 6, 6, 0, 7, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 8, 7, 8, 5, 7, - 4, 7, 7, 8, 7, 8, 8, 7, 2, 3, 3, 7, 2, 7, 0, 0, 6, 8, 8, 6, 8, 6, 6, 5, - 8, 6, 6, 8, 6, 8, 5, 6, 8, 6, 6, 8, 6, 8, 8, 6, 6, 8, 8, 6, 2, 6, 6, 0, - 4, 5, 5, 8, 5, 8, 8, 5, 5, 8, 8, 5, 8, 5, 5, 8, 4, 3, 3, 4, 2, 4, 4, 8, - 2, 3, 3, 8, 2, 0, 0, 0, 6, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 7, 6, 7, 5, 6, - 7, 6, 6, 7, 6, 7, 7, 6, 6, 7, 7, 6, 2, 6, 6, 0, 7, 5, 5, 7, 5, 7, 7, 5, - 5, 7, 7, 5, 7, 5, 5, 7, 4, 7, 7, 4, 7, 4, 4, 7, 2, 3, 3, 7, 2, 7, 0, 0, - 6, 5, 5, 6, 5, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 4, 6, 6, 4, 6, 4, 4, 6, - 6, 3, 3, 6, 2, 6, 6, 0, 4, 5, 5, 4, 
5, 4, 4, 5, 5, 3, 3, 5, 2, 5, 5, 0, - 4, 3, 3, 4, 2, 4, 4, 0, 2, 3, 3, 0, 2, 0, 0, 0, }; - -typedef struct index_bytes_consumed { - uint8_t index; - uint8_t bytes_consumed; -} index_bytes_consumed; - -static index_bytes_consumed combined_lookup[sizeof(bytes_consumed)]; - -static __m128i vectors[170]; - -void simdvbyteinit(void) { - vectors[0] = _mm_setr_epi8(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, -1, -1, 3, -1, - -1, -1); - vectors[1] = _mm_setr_epi8(0, -1, 4, -1, 1, -1, 5, 6, 2, -1, -1, -1, 3, -1, - -1, -1); - vectors[2] = _mm_setr_epi8(0, -1, 4, 5, 1, -1, 6, -1, 2, -1, -1, -1, 3, -1, - -1, -1); - vectors[3] = _mm_setr_epi8(0, -1, 4, 5, 1, -1, 6, 7, 2, -1, -1, -1, 3, -1, - -1, -1); - vectors[4] = _mm_setr_epi8(0, -1, 5, -1, 1, -1, 6, -1, 2, -1, -1, -1, 3, 4, - -1, -1); - vectors[5] = _mm_setr_epi8(0, -1, 5, -1, 1, -1, 6, 7, 2, -1, -1, -1, 3, 4, - -1, -1); - vectors[6] = _mm_setr_epi8(0, -1, 5, 6, 1, -1, 7, -1, 2, -1, -1, -1, 3, 4, - -1, -1); - vectors[7] = _mm_setr_epi8(0, -1, 5, 6, 1, -1, 7, 8, 2, -1, -1, -1, 3, 4, - -1, -1); - vectors[8] = _mm_setr_epi8(0, -1, 5, -1, 1, -1, 6, -1, 2, 3, -1, -1, 4, -1, - -1, -1); - vectors[9] = _mm_setr_epi8(0, -1, 5, -1, 1, -1, 6, 7, 2, 3, -1, -1, 4, -1, - -1, -1); - vectors[10] = _mm_setr_epi8(0, -1, 5, 6, 1, -1, 7, -1, 2, 3, -1, -1, 4, -1, - -1, -1); - vectors[11] = _mm_setr_epi8(0, -1, 5, 6, 1, -1, 7, 8, 2, 3, -1, -1, 4, -1, - -1, -1); - vectors[12] = _mm_setr_epi8(0, -1, 6, -1, 1, -1, 7, -1, 2, 3, -1, -1, 4, 5, - -1, -1); - vectors[13] = _mm_setr_epi8(0, -1, 6, -1, 1, -1, 7, 8, 2, 3, -1, -1, 4, 5, - -1, -1); - vectors[14] = _mm_setr_epi8(0, -1, 6, 7, 1, -1, 8, -1, 2, 3, -1, -1, 4, 5, - -1, -1); - vectors[15] = _mm_setr_epi8(0, -1, 6, 7, 1, -1, 8, 9, 2, 3, -1, -1, 4, 5, - -1, -1); - vectors[16] = _mm_setr_epi8(0, -1, 5, -1, 1, 2, 6, -1, 3, -1, -1, -1, 4, -1, - -1, -1); - vectors[17] = _mm_setr_epi8(0, -1, 5, -1, 1, 2, 6, 7, 3, -1, -1, -1, 4, -1, - -1, -1); - vectors[18] = _mm_setr_epi8(0, -1, 5, 6, 1, 2, 7, -1, 3, -1, -1, -1, 4, 
-1, - -1, -1); - vectors[19] = _mm_setr_epi8(0, -1, 5, 6, 1, 2, 7, 8, 3, -1, -1, -1, 4, -1, - -1, -1); - vectors[20] = _mm_setr_epi8(0, -1, 6, -1, 1, 2, 7, -1, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[21] = _mm_setr_epi8(0, -1, 6, -1, 1, 2, 7, 8, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[22] = _mm_setr_epi8(0, -1, 6, 7, 1, 2, 8, -1, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[23] = _mm_setr_epi8(0, -1, 6, 7, 1, 2, 8, 9, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[24] = _mm_setr_epi8(0, -1, 6, -1, 1, 2, 7, -1, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[25] = _mm_setr_epi8(0, -1, 6, -1, 1, 2, 7, 8, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[26] = _mm_setr_epi8(0, -1, 6, 7, 1, 2, 8, -1, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[27] = _mm_setr_epi8(0, -1, 6, 7, 1, 2, 8, 9, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[28] = _mm_setr_epi8(0, -1, 7, -1, 1, 2, 8, -1, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[29] = _mm_setr_epi8(0, -1, 7, -1, 1, 2, 8, 9, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[30] = _mm_setr_epi8(0, -1, 7, 8, 1, 2, 9, -1, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[31] = _mm_setr_epi8(0, -1, 7, 8, 1, 2, 9, 10, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[32] = _mm_setr_epi8(0, 1, 5, -1, 2, -1, 6, -1, 3, -1, -1, -1, 4, -1, - -1, -1); - vectors[33] = _mm_setr_epi8(0, 1, 5, -1, 2, -1, 6, 7, 3, -1, -1, -1, 4, -1, - -1, -1); - vectors[34] = _mm_setr_epi8(0, 1, 5, 6, 2, -1, 7, -1, 3, -1, -1, -1, 4, -1, - -1, -1); - vectors[35] = _mm_setr_epi8(0, 1, 5, 6, 2, -1, 7, 8, 3, -1, -1, -1, 4, -1, - -1, -1); - vectors[36] = _mm_setr_epi8(0, 1, 6, -1, 2, -1, 7, -1, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[37] = _mm_setr_epi8(0, 1, 6, -1, 2, -1, 7, 8, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[38] = _mm_setr_epi8(0, 1, 6, 7, 2, -1, 8, -1, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[39] = _mm_setr_epi8(0, 1, 6, 7, 2, -1, 8, 9, 3, -1, -1, -1, 4, 5, - -1, -1); - vectors[40] = _mm_setr_epi8(0, 1, 6, -1, 2, -1, 7, -1, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[41] = _mm_setr_epi8(0, 1, 6, -1, 2, 
-1, 7, 8, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[42] = _mm_setr_epi8(0, 1, 6, 7, 2, -1, 8, -1, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[43] = _mm_setr_epi8(0, 1, 6, 7, 2, -1, 8, 9, 3, 4, -1, -1, 5, -1, - -1, -1); - vectors[44] = _mm_setr_epi8(0, 1, 7, -1, 2, -1, 8, -1, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[45] = _mm_setr_epi8(0, 1, 7, -1, 2, -1, 8, 9, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[46] = _mm_setr_epi8(0, 1, 7, 8, 2, -1, 9, -1, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[47] = _mm_setr_epi8(0, 1, 7, 8, 2, -1, 9, 10, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[48] = _mm_setr_epi8(0, 1, 6, -1, 2, 3, 7, -1, 4, -1, -1, -1, 5, -1, - -1, -1); - vectors[49] = _mm_setr_epi8(0, 1, 6, -1, 2, 3, 7, 8, 4, -1, -1, -1, 5, -1, - -1, -1); - vectors[50] = _mm_setr_epi8(0, 1, 6, 7, 2, 3, 8, -1, 4, -1, -1, -1, 5, -1, - -1, -1); - vectors[51] = _mm_setr_epi8(0, 1, 6, 7, 2, 3, 8, 9, 4, -1, -1, -1, 5, -1, - -1, -1); - vectors[52] = _mm_setr_epi8(0, 1, 7, -1, 2, 3, 8, -1, 4, -1, -1, -1, 5, 6, - -1, -1); - vectors[53] = _mm_setr_epi8(0, 1, 7, -1, 2, 3, 8, 9, 4, -1, -1, -1, 5, 6, - -1, -1); - vectors[54] = _mm_setr_epi8(0, 1, 7, 8, 2, 3, 9, -1, 4, -1, -1, -1, 5, 6, - -1, -1); - vectors[55] = _mm_setr_epi8(0, 1, 7, 8, 2, 3, 9, 10, 4, -1, -1, -1, 5, 6, - -1, -1); - vectors[56] = _mm_setr_epi8(0, 1, 7, -1, 2, 3, 8, -1, 4, 5, -1, -1, 6, -1, - -1, -1); - vectors[57] = _mm_setr_epi8(0, 1, 7, -1, 2, 3, 8, 9, 4, 5, -1, -1, 6, -1, - -1, -1); - vectors[58] = _mm_setr_epi8(0, 1, 7, 8, 2, 3, 9, -1, 4, 5, -1, -1, 6, -1, - -1, -1); - vectors[59] = _mm_setr_epi8(0, 1, 7, 8, 2, 3, 9, 10, 4, 5, -1, -1, 6, -1, - -1, -1); - vectors[60] = _mm_setr_epi8(0, 1, 8, -1, 2, 3, 9, -1, 4, 5, -1, -1, 6, 7, - -1, -1); - vectors[61] = _mm_setr_epi8(0, 1, 8, -1, 2, 3, 9, 10, 4, 5, -1, -1, 6, 7, - -1, -1); - vectors[62] = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, -1, 4, 5, -1, -1, 6, 7, - -1, -1); - vectors[63] = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, -1, -1, 6, 7, - -1, -1); - vectors[64] = _mm_setr_epi8(0, 
-1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, - -1, -1, -1); - vectors[65] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, - 4, -1, -1); - vectors[66] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, - 4, 5, -1); - vectors[67] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, - -1, -1, -1); - vectors[68] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, - 5, -1, -1); - vectors[69] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, 3, -1, -1, 4, - 5, 6, -1); - vectors[70] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, - -1, -1, -1); - vectors[71] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, - -1, -1); - vectors[72] = _mm_setr_epi8(0, -1, -1, -1, 1, -1, -1, -1, 2, 3, 4, -1, 5, 6, - 7, -1); - vectors[73] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, - -1, -1, -1); - vectors[74] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, - 5, -1, -1); - vectors[75] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, -1, -1, -1, 4, - 5, 6, -1); - vectors[76] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, - -1, -1, -1); - vectors[77] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[78] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, 4, -1, -1, 5, 6, - 7, -1); - vectors[79] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, -1, - -1, -1); - vectors[80] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, - -1, -1); - vectors[81] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, -1, -1, 3, 4, 5, -1, 6, 7, - 8, -1); - vectors[82] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, - -1, -1, -1); - vectors[83] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, - -1, -1); - vectors[84] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, -1, -1, -1, 5, 6, - 7, -1); - vectors[85] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, -1, - -1, -1); - vectors[86] = _mm_setr_epi8(0, -1, -1, 
-1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, - -1, -1); - vectors[87] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, 5, -1, -1, 6, 7, - 8, -1); - vectors[88] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, -1, - -1, -1); - vectors[89] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, - -1, -1); - vectors[90] = _mm_setr_epi8(0, -1, -1, -1, 1, 2, 3, -1, 4, 5, 6, -1, 7, 8, - 9, -1); - vectors[91] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, - -1, -1, -1); - vectors[92] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, - 5, -1, -1); - vectors[93] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1, 4, - 5, 6, -1); - vectors[94] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, - -1, -1, -1); - vectors[95] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, - -1, -1); - vectors[96] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, 4, -1, -1, 5, 6, - 7, -1); - vectors[97] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, -1, - -1, -1); - vectors[98] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, - -1, -1); - vectors[99] = _mm_setr_epi8(0, 1, -1, -1, 2, -1, -1, -1, 3, 4, 5, -1, 6, 7, - 8, -1); - vectors[100] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, - -1, -1, -1); - vectors[101] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, - 6, -1, -1); - vectors[102] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, -1, -1, -1, 5, - 6, 7, -1); - vectors[103] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, - -1, -1, -1); - vectors[104] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, - -1, -1); - vectors[105] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, 5, -1, -1, 6, 7, - 8, -1); - vectors[106] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, -1, - -1, -1); - vectors[107] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, -1, 7, 8, - -1, -1); - vectors[108] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, -1, -1, 4, 5, 6, 
-1, 7, 8, - 9, -1); - vectors[109] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, - -1, -1, -1); - vectors[110] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, - -1, -1); - vectors[111] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, -1, -1, -1, 6, 7, - 8, -1); - vectors[112] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, -1, - -1, -1); - vectors[113] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, - -1, -1); - vectors[114] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, 6, -1, -1, 7, 8, - 9, -1); - vectors[115] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, -1, - -1, -1); - vectors[116] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, - -1, -1); - vectors[117] = _mm_setr_epi8(0, 1, -1, -1, 2, 3, 4, -1, 5, 6, 7, -1, 8, 9, - 10, -1); - vectors[118] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, - -1, -1, -1); - vectors[119] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, - 6, -1, -1); - vectors[120] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, -1, -1, -1, 5, - 6, 7, -1); - vectors[121] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, - -1, -1, -1); - vectors[122] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, - -1, -1); - vectors[123] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, 5, -1, -1, 6, 7, - 8, -1); - vectors[124] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, -1, - -1, -1); - vectors[125] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, - -1, -1); - vectors[126] = _mm_setr_epi8(0, 1, 2, -1, 3, -1, -1, -1, 4, 5, 6, -1, 7, 8, - 9, -1); - vectors[127] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, - -1, -1, -1); - vectors[128] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, - -1, -1); - vectors[129] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, -1, -1, -1, 6, 7, - 8, -1); - vectors[130] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, -1, - -1, -1); - 
vectors[131] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, - -1, -1); - vectors[132] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, 6, -1, -1, 7, 8, - 9, -1); - vectors[133] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, -1, - -1, -1); - vectors[134] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, - -1, -1); - vectors[135] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, -1, -1, 5, 6, 7, -1, 8, 9, - 10, -1); - vectors[136] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, -1, - -1, -1); - vectors[137] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, - -1, -1); - vectors[138] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, -1, -1, -1, 7, 8, - 9, -1); - vectors[139] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, -1, - -1, -1); - vectors[140] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, - -1, -1); - vectors[141] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, -1, -1, 8, 9, - 10, -1); - vectors[142] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, -1, - -1, -1); - vectors[143] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, - -1, -1); - vectors[144] = _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, - 11, -1); - vectors[145] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, - -1, -1, -1, 1); - vectors[146] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, 0, 2, -1, -1, -1, - -1, -1, -1, 1); - vectors[147] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, - -1, -1, -1, 1); - vectors[148] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, 4, - -1, -1, 1); - vectors[149] = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, 0, 2, -1, 3, -1, 4, - -1, 5, 1); - vectors[150] = _mm_setr_epi8(1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, - -1, -1, -1, 2); - vectors[151] = _mm_setr_epi8(1, -1, -1, -1, -1, -1, -1, 0, 3, -1, -1, -1, - -1, -1, -1, 2); - vectors[152] = _mm_setr_epi8(1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, -1, - -1, -1, 2); - 
vectors[153] = _mm_setr_epi8(1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, 5, - -1, -1, 2); - vectors[154] = _mm_setr_epi8(1, -1, -1, -1, -1, -1, -1, 0, 3, -1, 4, -1, 5, - -1, 6, 2); - vectors[155] = _mm_setr_epi8(1, -1, 2, -1, -1, -1, -1, 0, -1, -1, -1, -1, - -1, -1, -1, 3); - vectors[156] = _mm_setr_epi8(1, -1, 2, -1, -1, -1, -1, 0, 4, -1, -1, -1, -1, - -1, -1, 3); - vectors[157] = _mm_setr_epi8(1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, -1, - -1, -1, 3); - vectors[158] = _mm_setr_epi8(1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, 6, - -1, -1, 3); - vectors[159] = _mm_setr_epi8(1, -1, 2, -1, -1, -1, -1, 0, 4, -1, 5, -1, 6, - -1, 7, 3); - vectors[160] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, -1, 0, -1, -1, -1, -1, -1, - -1, -1, 4); - vectors[161] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, -1, 0, 5, -1, -1, -1, -1, - -1, -1, 4); - vectors[162] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, -1, - -1, -1, 4); - vectors[163] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, 7, - -1, -1, 4); - vectors[164] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, -1, 0, 5, -1, 6, -1, 7, - -1, 8, 4); - vectors[165] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, 4, 0, -1, -1, -1, -1, -1, - -1, -1, 5); - vectors[166] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, 4, 0, 6, -1, -1, -1, -1, - -1, -1, 5); - vectors[167] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, -1, - -1, -1, 5); - vectors[168] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, 8, -1, - -1, 5); - vectors[169] = _mm_setr_epi8(1, -1, 2, -1, 3, -1, 4, 0, 6, -1, 7, -1, 8, -1, - 9, 5); - uint64_t i; - for (i = 0; i < sizeof(bytes_consumed); i++) { - index_bytes_consumed combined = { vec_lookup[i], bytes_consumed[i] }; - combined_lookup[i] = combined; - } - -} - -static uint64_t masked_vbyte_read_group(const uint8_t* in, uint32_t* out, - uint64_t mask, uint64_t* ints_read) { - __m128i initial = _mm_lddqu_si128((const __m128i *) (in)); - __m128i * mout = (__m128i *) out; - - if (!(mask & 0xFFFF)) { - __m128i result = 
_mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout, result); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 1, result); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 2, result); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 3, result); - *ints_read = 16; - return 16; - } - - uint32_t low_12_bits = mask & 0xFFF; - // combine index and bytes consumed into a single lookup - index_bytes_consumed combined = combined_lookup[low_12_bits]; - uint64_t consumed = combined.bytes_consumed; - uint8_t index = combined.index; - - __m128i shuffle_vector = vectors[index]; - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - _mm_storeu_si128(mout, unpacked_result_a); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - _mm_storel_epi64(mout+1, unpacked_result_b); - //_mm_storeu_si128(mout + 1, unpacked_result_b); // maybe faster to write 16 bytes? 
- return consumed; - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - _mm_storeu_si128(mout, result); - return consumed; - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - _mm_storel_epi64(mout, result); - //_mm_storeu_si128(mout, result); // maybe faster to write 16 bytes? 
- - return consumed; -} - -__m128i PrefixSum(__m128i curr, __m128i prev) { - __m128i Add = _mm_slli_si128(curr, 4); // Cycle 1: [- A B C] (already done) - prev = _mm_shuffle_epi32(prev, 0xff); // Cycle 2: [P P P P] - curr = _mm_add_epi32(curr, Add); // Cycle 2: [A AB BC CD] - Add = _mm_slli_si128(curr, 8); // Cycle 3: [- - A AB] - curr = _mm_add_epi32(curr, prev); // Cycle 3: [PA PAB PBC PCD] - curr = _mm_add_epi32(curr, Add); // Cycle 4: [PA PAB PABC PABCD] - return curr; -} - -// only the first two ints of curr are meaningful, rest is garbage to beignored -__m128i PrefixSum2ints(__m128i curr, __m128i prev) { - __m128i Add = _mm_slli_si128(curr, 4); // Cycle 1: [- A B G] (already done) - prev = _mm_shuffle_epi32(prev, 0xff); // Cycle 2: [P P P P] - curr = _mm_add_epi32(curr, Add); // Cycle 2: [A AB BG GG] - curr = _mm_shuffle_epi32(curr, 0x54); //Cycle 3:[A AB AB AB] - curr = _mm_add_epi32(curr, prev); // Cycle 4: [PA PAB PAB PAB] - return curr; -} - -static uint64_t masked_vbyte_read_group_delta(const uint8_t* in, uint32_t* out, - uint64_t mask, uint64_t* ints_read, __m128i * prev) { - __m128i initial = _mm_lddqu_si128((const __m128i *) (in)); - __m128i * mout = (__m128i *) out; - - if (!(mask & 0xFFFF)) { - __m128i result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 1, *prev); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 2, *prev); - initial = _mm_srli_si128(initial, 4); - result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 3, *prev); - *ints_read = 16; - return 16; - } - - uint32_t low_12_bits = mask & 0xFFF; - // combine index and bytes consumed into a single lookup - index_bytes_consumed combined = combined_lookup[low_12_bits]; 
- uint64_t consumed = combined.bytes_consumed; - uint8_t index = combined.index; - - __m128i shuffle_vector = vectors[index]; - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - *prev = PrefixSum(unpacked_result_a, *prev); - _mm_storeu_si128(mout, *prev); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - *prev = PrefixSum2ints(unpacked_result_b, *prev); - _mm_storel_epi64(mout + 1, *prev); - return consumed; - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - return consumed; - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, 
shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - *prev = PrefixSum2ints(result, *prev); - _mm_storel_epi64(mout, *prev); - return consumed; -} - - -static int read_int_group(const uint8_t* in, uint32_t* out, int* ints_read) { - - __m128i initial = _mm_lddqu_si128((const __m128i *) in); - __m128i * const mout = (__m128i *) out; - - int mask = _mm_movemask_epi8(initial); - if (mask == 0) { - __m128i result; - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - _mm_storeu_si128(mout, result); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - _mm_storeu_si128(mout + 1, result); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - _mm_storeu_si128(mout + 2, result); - result = _mm_cvtepi8_epi32(initial); - _mm_storeu_si128(mout + 3, result); - *ints_read = 16; - return 16; - } - int mask2 = mask & 0xFFF; - index_bytes_consumed combined = combined_lookup[mask2]; - - int index = combined.index; - - __m128i shuffle_vector = vectors[index]; - int consumed = combined.bytes_consumed; - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - _mm_storeu_si128(mout, unpacked_result_a); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - _mm_storel_epi64(mout + 1, unpacked_result_b); - return consumed; - } - if (index < 145) { - - 
*ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - _mm_storeu_si128(mout, result); - return consumed; - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - __m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - - _mm_storel_epi64(mout, result); - return consumed; -} - - -// len_signed : number of ints we want to decode -size_t masked_vbyte_decode(const uint8_t* in, uint32_t* out, - uint64_t length) { - size_t consumed = 0; // number of bytes read - uint64_t count = 0; // how many integers we have read so far - - uint64_t sig = 0; - int availablebytes = 0; - if (96 < length) { - size_t scanned = 0; - - -#ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); -#else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = 
_mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; - - while (count + 96 < length) { // 96 == 48 + 48 ahead for scanning - uint64_t thisSig = nextSig; - -#ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); -#else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint64_t bytes = masked_vbyte_read_group(in + consumed, - out + count, sig, &ints_read); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - count += ints_read; - } - } - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (availablebytes + count < length) { - if (availablebytes < 16) { - if (availablebytes + count + 31 < length) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) 
_mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes + 16 - + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if (availablebytes + count + 15 < length) { - int newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } - uint64_t ints_read; - - uint64_t eaten = masked_vbyte_read_group(in + consumed, out + count, - sig, &ints_read); - consumed += eaten; - availablebytes -= eaten; - sig >>= eaten; - count += ints_read; - } - for (; count < length; count++) { - consumed += read_int(in + consumed, out + count); - } - return consumed; -} - - -// inputsize : number of input bytes we want to decode -// returns the number of written ints -size_t masked_vbyte_decode_fromcompressedsize(const uint8_t* in, uint32_t* out, - size_t inputsize) { - size_t consumed = 0; // number of bytes read - uint32_t * initout = out; - - uint64_t sig = 0; - int availablebytes = 0; - if (96 < inputsize) { - size_t scanned = 0; - - -#ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); -#else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i 
high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; - - while (scanned + 48 <= inputsize) { // 96 == 48 + 48 ahead for scanning - uint64_t thisSig = nextSig; - -#ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); -#else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint64_t bytes = masked_vbyte_read_group(in + consumed, - out, sig, &ints_read); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - out += ints_read; - } - } - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (1) { - if (availablebytes < 16) { - if (availablebytes + consumed + 31 < inputsize) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) _mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes + 16 
- + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if(availablebytes + consumed + 15 < inputsize ) { - int newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } - uint64_t ints_read; - uint64_t bytes = masked_vbyte_read_group(in + consumed, out, - sig, &ints_read); - consumed += bytes; - availablebytes -= bytes; - sig >>= bytes; - out += ints_read; - } - while (consumed < inputsize) { - unsigned int shift = 0; - uint32_t v; - for (v = 0; consumed < inputsize; shift += 7) { - uint8_t c = in[consumed++]; - if ((c & 128) == 0) { - out[0] = v + (c << shift); - ++out; - break; - } else { - v += (c & 127) << shift; - } - } - } - return out - initout; -} - - -size_t read_ints(const uint8_t* in, uint32_t* out, int length) { - size_t consumed = 0; - int count; - for (count = 0; count + 15 < length;) { - int ints_read; - consumed += read_int_group(in + consumed, out + count, &ints_read); - count += ints_read; - } - for (; count < length; count++) { - consumed += read_int(in + consumed, out + count); - } - return consumed; -} - -static int read_int_group_delta(const uint8_t* in, uint32_t* out, - int* ints_read, __m128i * prev) { - - __m128i initial = _mm_lddqu_si128((const __m128i *) in); - __m128i * const mout = (__m128i *) out; - - int mask = _mm_movemask_epi8(initial); - if (mask == 0) { - __m128i result; - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 1, *prev); - result = _mm_cvtepi8_epi32(initial); - initial = _mm_srli_si128(initial, 4); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 2, *prev); - 
result = _mm_cvtepi8_epi32(initial); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout + 3, *prev); - *ints_read = 16; - return 16; - } - int mask2 = mask & 0xFFF; - index_bytes_consumed combined = combined_lookup[mask2]; - - int index = combined.index; - - __m128i shuffle_vector = vectors[index]; - int consumed = combined.bytes_consumed; - - if (index < 64) { - *ints_read = 6; - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x007F)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi16(0x7F00)); - __m128i high_bytes_shifted = _mm_srli_epi16(high_bytes, 1); - __m128i packed_result = _mm_or_si128(low_bytes, high_bytes_shifted); - __m128i unpacked_result_a = _mm_and_si128(packed_result, - _mm_set1_epi32(0x0000FFFF)); - *prev = PrefixSum(unpacked_result_a, *prev); - _mm_storeu_si128(mout, *prev); - __m128i unpacked_result_b = _mm_srli_epi32(packed_result, 16); - *prev = PrefixSum2ints(unpacked_result_b, *prev); - _mm_storeu_si128(mout + 1, *prev); - return consumed; - } - if (index < 145) { - - *ints_read = 4; - - __m128i bytes_to_decode = _mm_shuffle_epi8(initial, shuffle_vector); - __m128i low_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x0000007F)); - __m128i middle_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x00007F00)); - __m128i high_bytes = _mm_and_si128(bytes_to_decode, - _mm_set1_epi32(0x007F0000)); - __m128i middle_bytes_shifted = _mm_srli_epi32(middle_bytes, 1); - __m128i high_bytes_shifted = _mm_srli_epi32(high_bytes, 2); - __m128i low_middle = _mm_or_si128(low_bytes, middle_bytes_shifted); - __m128i result = _mm_or_si128(low_middle, high_bytes_shifted); - *prev = PrefixSum(result, *prev); - _mm_storeu_si128(mout, *prev); - - return consumed; - } - - *ints_read = 2; - - __m128i data_bits = _mm_and_si128(initial, _mm_set1_epi8(0x7F)); - __m128i bytes_to_decode = _mm_shuffle_epi8(data_bits, shuffle_vector); - 
__m128i split_bytes = _mm_mullo_epi16(bytes_to_decode, - _mm_setr_epi16(128, 64, 32, 16, 128, 64, 32, 16)); - __m128i shifted_split_bytes = _mm_slli_epi64(split_bytes, 8); - __m128i recombined = _mm_or_si128(split_bytes, shifted_split_bytes); - __m128i low_byte = _mm_srli_epi64(bytes_to_decode, 56); - __m128i result_evens = _mm_or_si128(recombined, low_byte); - __m128i result = _mm_shuffle_epi8(result_evens, - _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, - -1)); - *prev = PrefixSum2ints(result, *prev); - _mm_storeu_si128(mout, *prev); - return consumed; -} - - -// len_signed : number of ints we want to decode -size_t masked_vbyte_decode_delta(const uint8_t* in, uint32_t* out, - uint64_t length, uint32_t prev) { - //uint64_t length = (uint64_t) len_signed; // number of ints we want to decode - size_t consumed = 0; // number of bytes read - __m128i mprev = _mm_set1_epi32(prev); - uint64_t count = 0; // how many integers we have read so far - - uint64_t sig = 0; - int availablebytes = 0; - if (96 < length) { - size_t scanned = 0; - - -#ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); -#else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; - - while (count + 96 < length) { // 96 == 48 + 48 ahead for scanning - uint64_t thisSig = nextSig; - -#ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + 
scanned)); - lowSig = _mm256_movemask_epi8(low); -#else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint64_t bytes = masked_vbyte_read_group_delta(in + consumed, - out + count, sig, &ints_read, &mprev); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - count += ints_read; - } - } - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (availablebytes + count < length) { - if (availablebytes < 16) break; - - if (availablebytes < 16) { - if (availablebytes + count + 31 < length) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) _mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes + 16 - + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if (availablebytes + count + 15 < length) { - int newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - 
+ consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } - uint64_t ints_read; - uint64_t eaten = masked_vbyte_read_group_delta(in + consumed, out + count, - sig, &ints_read, &mprev); - consumed += eaten; - availablebytes -= eaten; - sig >>= eaten; - count += ints_read; - } - prev = _mm_extract_epi32(mprev, 3); - for (; count < length; count++) { - consumed += read_int_delta(in + consumed, out + count, &prev); - } - return consumed; -} - -size_t read_ints_delta(const uint8_t* in, uint32_t* out, int length, - uint32_t prev) { - __m128i mprev = _mm_set1_epi32(prev); - size_t consumed = 0; - int count; - for (count = 0; count + 15 < length;) { - int ints_read; - consumed += read_int_group_delta(in + consumed, out + count, &ints_read, - &mprev); - count += ints_read; - } - prev = _mm_extract_epi32(mprev, 3); - for (; count < length; count++) { - consumed += read_int_delta(in + consumed, out + count, &prev); - } - return consumed; -} - - - -// inputsize : number of input bytes we want to decode -// returns the number of written ints -size_t masked_vbyte_decode_fromcompressedsize_delta(const uint8_t* in, uint32_t* out, - size_t inputsize, uint32_t prev) { - size_t consumed = 0; // number of bytes read - uint32_t * initout = out; - __m128i mprev = _mm_set1_epi32(prev); - uint64_t sig = 0; - int availablebytes = 0; - if (96 < inputsize) { - size_t scanned = 0; - - -#ifdef __AVX2__ - __m256i low = _mm256_loadu_si256((__m256i *)(in + scanned)); - uint32_t lowSig = _mm256_movemask_epi8(low); -#else - __m128i low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - uint32_t lowSig1 = _mm_movemask_epi8(low1); - __m128i low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - uint32_t lowSig2 = _mm_movemask_epi8(low2); - uint32_t lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - // excess verbosity to avoid problems with sign extension on conversions - // better to think about what's happening and make it clearer - __m128i high 
= _mm_loadu_si128((__m128i *) (in + scanned + 32)); - uint32_t highSig = _mm_movemask_epi8(high); - uint64_t nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - scanned += 48; - - while (scanned + 48 <= inputsize) { // 96 == 48 + 48 ahead for scanning - uint64_t thisSig = nextSig; - -#ifdef __AVX2__ - low = _mm256_loadu_si256((__m256i *)(in + scanned)); - lowSig = _mm256_movemask_epi8(low); -#else - low1 = _mm_loadu_si128((__m128i *) (in + scanned)); - lowSig1 = _mm_movemask_epi8(low1); - low2 = _mm_loadu_si128((__m128i *) (in + scanned + 16)); - lowSig2 = _mm_movemask_epi8(low2); - lowSig = lowSig2 << 16; - lowSig |= lowSig1; -#endif - - high = _mm_loadu_si128((__m128i *) (in + scanned + 32)); - highSig = _mm_movemask_epi8(high); - nextSig = highSig; - nextSig <<= 32; - nextSig |= lowSig; - - uint64_t remaining = scanned - (consumed + 48); - sig = (thisSig << remaining) | sig; - - uint64_t reload = scanned - 16; - scanned += 48; - - // need to reload when less than 16 scanned bytes remain in sig - while (consumed < reload) { - uint64_t ints_read; - uint64_t bytes = masked_vbyte_read_group_delta(in + consumed, - out, sig, &ints_read, &mprev); - sig >>= bytes; - - // seems like this might force the compiler to prioritize shifting sig >>= bytes - if (sig == 0xFFFFFFFFFFFFFFFF) - return 0; // fake check to force earliest evaluation - - consumed += bytes; - out += ints_read; - } - } - sig = (nextSig << (scanned - consumed - 48)) | sig; - availablebytes = scanned - consumed; - } - while (1) { - if (availablebytes < 16) { - if (availablebytes + consumed + 31 < inputsize) { -#ifdef __AVX2__ - uint64_t newsigavx = (uint32_t) _mm256_movemask_epi8(_mm256_loadu_si256((__m256i *)(in + availablebytes + consumed))); - sig |= (newsigavx << availablebytes); -#else - uint64_t newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - uint64_t newsig2 = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + 
availablebytes + 16 - + consumed))); - sig |= (newsig << availablebytes) - | (newsig2 << (availablebytes + 16)); -#endif - availablebytes += 32; - } else if(availablebytes + consumed + 15 < inputsize ) { - int newsig = _mm_movemask_epi8( - _mm_lddqu_si128( - (const __m128i *) (in + availablebytes - + consumed))); - sig |= newsig << availablebytes; - availablebytes += 16; - } else { - break; - } - } - uint64_t ints_read; - uint64_t bytes = masked_vbyte_read_group_delta(in + consumed, out, - sig, &ints_read, &mprev); - consumed += bytes; - availablebytes -= bytes; - sig >>= bytes; - out += ints_read; - } - prev = _mm_extract_epi32(mprev, 3); - while (consumed < inputsize) { - unsigned int shift = 0; uint32_t v; - for (v = 0; consumed < inputsize; shift += 7) { - uint8_t c = in[consumed++]; - if ((c & 128) == 0) { - uint32_t delta = v + (c << shift); - prev += delta; - *out++ = prev; - break; - } else { - v += (c & 127) << shift; - } - } - } - return out - initout; -} - - diff --git a/ext/MaskedVByte/src/varintencode.c b/ext/MaskedVByte/src/varintencode.c deleted file mode 100644 index dc86969..0000000 --- a/ext/MaskedVByte/src/varintencode.c +++ /dev/null @@ -1,94 +0,0 @@ -#include "../include/varintencode.h" - - -size_t vbyte_encode_delta(uint32_t *in, size_t length, uint8_t *bout, uint32_t prev) { - uint8_t *initbout = bout; size_t k; - for (k = 0; k < length; ++k) { - const uint32_t val = in[k] - prev; - prev = in[k]; - if (val < (1U << 7)) { - *bout = val & 0x7F; - ++bout; - } else if (val < (1U << 14)) { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 14); - ++bout; - } else if (val < (1U << 28)) { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = 
(uint8_t)(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 21); - ++bout; - } else { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 21) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 28); - ++bout; - } - } - return bout - initbout; -} - -size_t vbyte_encode(uint32_t *in, size_t length, uint8_t *bout) { - uint8_t *initbout = bout; size_t k; - for (k = 0; k < length; ++k) { - const uint32_t val = in[k]; - - if (val < (1U << 7)) { - *bout = val & 0x7F; - ++bout; - } else if (val < (1U << 14)) { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 7); - ++bout; - } else if (val < (1U << 21)) { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 14); - ++bout; - } else if (val < (1U << 28)) { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 21); - ++bout; - } else { - *bout = (uint8_t)((val & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 7) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 14) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(((val >> 21) & 0x7F) | (1U << 7)); - ++bout; - *bout = (uint8_t)(val >> 28); - ++bout; - } - } - return bout - initbout; -} - - diff --git a/ext/OPT_PFD/unpack.h b/ext/OPT_PFD/unpack.h index fa810e9..abb225c 100644 --- a/ext/OPT_PFD/unpack.h +++ b/ext/OPT_PFD/unpack.h @@ -724,7 +724,7 @@ void unpack20(unsigned int *p, unsigned int *w) } -void unpack32(unsigned int *p, unsigned int *w) +static void unpack32(unsigned int *p, unsigned int *w) { int i; diff --git a/ext/bench_/bench/codecs.h b/ext/bench_/bench/codecs.h new 
file mode 100644 index 0000000..852327e --- /dev/null +++ b/ext/bench_/bench/codecs.h @@ -0,0 +1,172 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#ifndef CODECS_H_ +#define CODECS_H_ + +#include "common.h" +#include "util.h" +#include "bitpackinghelpers.h" + +namespace FastPForLib { + +class NotEnoughStorage : public std::runtime_error { +public: + size_t required; // number of 32-bit symbols required + NotEnoughStorage(const size_t req) + : runtime_error(""), required(req){ + + }; +}; + +class IntegerCODEC { +public: + /** + * You specify input and input length, as well as + * output and output length. nvalue gets modified to + * reflect how much was used. If the new value of + * nvalue is more than the original value, we can + * consider this a buffer overrun. + * + * You are responsible for allocating the memory (length + * for *in and nvalue for *out). + */ + virtual void encodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) = 0; + + /** + * Usage is similar to decodeArray except that it returns a pointer + * incremented from in. In theory it should be in+length. If the + * returned pointer is less than in+length, then this generally means + * that the decompression is not finished (some scheme compress + * the bulk of the data one way, and they then they compress remaining + * integers using another scheme). + * + * As with encodeArray, you need to have length element allocated + * for *in and at least nvalue elements allocated for out. The value + * of the variable nvalue gets updated with the number actually use + * (if nvalue exceeds the original value, there might be a buffer + * overrun). 
+ */ + virtual const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) = 0; + virtual ~IntegerCODEC() {} + + /** + * Will compress the content of a vector into + * another vector. + * + * This is offered for convenience. It might be slow. + */ + virtual std::vector compress(const std::vector &data) { + std::vector compresseddata(data.size() * 2 + + 1024); // allocate plenty of memory + size_t memavailable = compresseddata.size(); + encodeArray(&data[0], data.size(), &compresseddata[0], memavailable); + compresseddata.resize(memavailable); + return compresseddata; + } + + /** + * Will uncompress the content of a vector into + * another vector. Some CODECs know exactly how much data to uncompress, + * others need to uncompress it all to know how data there is to uncompress... + * So it useful to have a hint (expected_uncompressed_size) that tells how + * much data there will be to uncompress. Otherwise, the code will + * try to guess, but the result is uncertain and inefficient. You really + * ought to keep track of how many symbols you had compressed. + * + * For convenience. Might be slow. + */ + virtual std::vector + uncompress(const std::vector &compresseddata, + size_t expected_uncompressed_size = 0) { + std::vector data( + expected_uncompressed_size); // allocate plenty of memory + size_t memavailable = data.size(); + try { + decodeArray(&compresseddata[0], compresseddata.size(), &data[0], + memavailable); + } catch (NotEnoughStorage &nes) { + data.resize(nes.required + 1024); + decodeArray(&compresseddata[0], compresseddata.size(), &data[0], + memavailable); + } + data.resize(memavailable); + return data; + } + + virtual std::string name() const = 0; +}; + +/****************** + * This just copies the data, no compression. 
+ */ +class JustCopy : public IntegerCODEC { +public: + void encodeArray(const uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + memcpy(out, in, sizeof(uint32_t) * length); + nvalue = length; + } + // like encodeArray, but we don't actually copy + void fakeencodeArray(const uint32_t * /*in*/, const size_t length, + size_t &nvalue) { + nvalue = length; + } + + const uint32_t *decodeArray(const uint32_t *in, const size_t length, + uint32_t *out, size_t &nvalue) { + memcpy(out, in, sizeof(uint32_t) * length); + nvalue = length; + return in + length; + } + std::string name() const { return "JustCopy"; } +}; + +/******** + * This uses a single bit width for the whole array. + * It has fast decompression and random access, but + * relatively poor compression. Included as an example. + */ +class PackedCODEC : public IntegerCODEC { +public: + enum { BlockSize = 32 }; + void encodeArray(const uint32_t *in, const size_t length, uint32_t *out, + size_t &nvalue) { + checkifdivisibleby(length, 32); + const uint32_t b = maxbits(in, in + length); + out[0] = static_cast(length); + out[1] = b; + out += 2; + for (uint32_t run = 0; run < length / 32; ++run, in += 32, out += b) { + fastpackwithoutmask(in, out, b); + } + nvalue = 2 + length * b / 32; + } +#ifndef NDEBUG + const uint32_t *decodeArray(const uint32_t *in, const size_t length, +#else + const uint32_t *decodeArray(const uint32_t *in, const size_t /*length*/, +#endif + uint32_t *out, size_t &nvalue) { + nvalue = in[0]; + const uint32_t b = in[1]; + assert(length >= nvalue * b / 32); + in += 2; + for (uint32_t run = 0; run < nvalue / 32; ++run, in += b, out += 32) { + fastunpack(in, out, b); + } + return in; + } + + std::string name() const { return "PackedCODEC"; } +}; + +} // namespace FastPFor + +#endif /* CODECS_H_ */ diff --git a/ext/bench_/bench/common.h b/ext/bench_/bench/common.h new file mode 100644 index 0000000..3d409bb --- /dev/null +++ b/ext/bench_/bench/common.h @@ -0,0 +1,59 @@ +/** + * 
This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef COMMON_H_ +#define COMMON_H_ + +// C headers (sorted) +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#include +#endif +#include +#ifndef _WIN32 +#include +#endif +#include +#include + +// C++ headers (sorted) +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#include +#include +#include + +#define __attribute__(n) +#define __restrict__ +#define constexpr inline + +#endif + +#endif /* COMMON_H_ */ diff --git a/ext/bench_/bench/compress.h b/ext/bench_/bench/compress.h new file mode 100644 index 0000000..fa5b1e2 --- /dev/null +++ b/ext/bench_/bench/compress.h @@ -0,0 +1,29 @@ +/* + COMPRESS.H + ---------- +*/ +#ifndef COMPRESS_H_ +#define COMPRESS_H_ + +#include +#define ANT_compressable_integer uint32_t + +/* + class ANT_COMPRESS + ------------------ +*/ +class ANT_compress + { + public: + ANT_compress() {} + virtual ~ANT_compress() {} + + /* + destination_length is in bytes. 
source_integers is in units of integers, returns the length in bytes + */ + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) = 0; + virtual void decompress(uint32_t *destination, uint64_t destination_integers, uint8_t *source, uint64_t source_length) = 0; + } ; + +#endif /* COMPRESS_H_ */ + diff --git a/ext/bench_/bench/compress_opt.h b/ext/bench_/bench/compress_opt.h new file mode 100644 index 0000000..c215d84 --- /dev/null +++ b/ext/bench_/bench/compress_opt.h @@ -0,0 +1,65 @@ +/* + COMPRESS_OPT.H + -------------- +*/ +#include "compress.h" +#include "optpfor.h" +#include "compress_variable_byte.h" + +/* + CLASS ANT_COMPRESS_OPT + ---------------------- +*/ +class ANT_compress_opt : public ANT_compress + { + FastPForLib::OPTPFor<128/32,FastPForLib::Simple16> shrinkerator; + ANT_compress_variable_byte compressor; + + + void encodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t *nvalue) + { + size_t answer = *nvalue; + size_t PFORcompressed; + size_t PFORencoded = (len) - (len % 128); + uint64_t VBYTEencoded = len % 128; + + + if (PFORencoded != 0) + shrinkerator.encodeArray(in, PFORencoded, out, answer); + else + answer = 0; + + // For PFOR stuff, we want the number of uint32_t's, for VByte we want the number of bytes + PFORcompressed = answer; + uint64_t compressed_size = sizeof(uint32_t) * answer; + compressed_size += compressor.compress(((uint8_t *)out) + compressed_size, (*nvalue - PFORcompressed) * sizeof(uint32_t), (uint32_t *)in + PFORencoded, VBYTEencoded); + *nvalue = compressed_size; // return the compressed size in bytes + } + + + void decodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t nvalue) + { + size_t pfor_integers = nvalue - (nvalue % 128); + uint8_t *source; + + if (pfor_integers != 0) + source = (uint8_t *)shrinkerator.decodeArray(in, pfor_integers, out, pfor_integers); + else + source = (uint8_t *)in; + compressor.decompress(out + 
pfor_integers, nvalue - pfor_integers, source, len - pfor_integers * sizeof(*out)); + } + + + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) + { + uint64_t answer = source_integers; + encodeArray(source, source_integers, (uint32_t *)destination, &answer); + return answer; + } + + virtual void decompress(uint32_t *destination, uint64_t destinaton_integers, uint8_t *source, uint64_t source_length) + { + decodeArray((uint32_t *)source, source_length, destination, destinaton_integers); + } + + } ; diff --git a/ext/bench_/bench/compress_qmx.cpp b/ext/bench_/bench/compress_qmx.cpp new file mode 100644 index 0000000..42b0918 --- /dev/null +++ b/ext/bench_/bench/compress_qmx.cpp @@ -0,0 +1,1573 @@ +/* + COMPRESS_QMX.C + -------------- + Copyright (c) 2014 by Andrew Trotman + Licensed BSD + + A version of BinPacking where we pack into a 128-bit SSE register the following: + 256 0-bit words + 128 1-bit words + 64 2-bit words + 40 3-bit words + 32 4-bit words + 24 5-bit words + 20 6-bit words + 16 8-bit words + 12 10-bit words + 8 16-bit words + 4 32-bit words + or pack into two 128-bit words (i.e. 256 bits) the following: + 36 7-bit words + 28 9-bit words + 20 12-bit words + 12 21-bit words + or pach short sequences as: + 1 32-bit word + 1 24-bit word + 1 16-bit word + 1 8-bit word + + This gives 15 possible combinations. The combinaton is stored in the top 4 bits of a selector byte. The + bottom 4-bits of the selector store a run-length (the number of such sequences seen in a row. + + The 128-bit (or 256-bit) packed binary values are stored first. Then we store the selectors, Finally, + stored variable byte encoded, is a pointer to the start of the selector (from the end of the sequence). + + This way, all reads and writes are 128-bit word aligned, except addressing the selector (and the pointer + the selector). These reads are byte aligned. + + Short sequences are encoded using selectors 0xF0-0xFF. 
The top nybble is the indicator of a short sequence + while the bottom is divided into 2 2-bit numbers, xxyy. xx is the type and yy is the run length. Possible + types for xx are: + 00 8-bit integer + 01 16-bit integer + 10 24-bit integer + 11 32-bit integer + value runlengths for yy are 00, 01, 10, 11. They are the 2's complement of the integer run-length (0-3). +*/ +#include +#include +#include +#include +#include +#include "compress_qmx.h" + +//#define MAKE_DECOMPRESS 1 /* uncomment this and it will create a program that writes the decompressor */ +//#define TEST_ONE_STRING 1 /* Uncomment this and it will create a program that can be used to test the compressor and decompressor */ +#define NO_ZEROS 1 /* stores runs of 256 1s in a row (not 1-bit number, but actual 1 values). */ +#define SHORT_END_BLOCKS 1 + +#ifdef _MSC_VER + #define ALIGN_16 __declspec(align(16)) +#else + #define ALIGN_16 __attribute__ ((aligned (16))) +#endif + +//#define STATS /* uncomment this and it will count the selector usage */ +#ifdef STATS + static uint32_t stats[65] = {0}; +#endif + +/* + ANT_COMPRESS_QMX::ANT_COMPRESS_QMX() + ------------------------------------ +*/ +ANT_compress_qmx::ANT_compress_qmx() +{ +length_buffer = NULL; +length_buffer_length = 0; +} + +/* + ANT_COMPRESS_QMX::~ANT_COMPRESS_QMX() + ------------------------------------- +*/ +ANT_compress_qmx::~ANT_compress_qmx() +{ +delete [] length_buffer; +#ifdef STATS + uint32_t which; + for (which = 0; which <= 32; which++) + if (stats[which] != 0) + printf("%d\t%d\ttimes\n", which, stats[which]); +#endif +} + +/* + BYTES_NEEDED_FOR() + ------------------ +*/ +static uint8_t bytes_needed_for(uint32_t value) +{ +if (value <= 0xFF) + return 1; +else if (value <= 0xFFFF) + return 2; +else if (value <= 0xFFFFFF) + return 3; +else + return 4; +} + + +/* + BITS_NEEDED_FOR() + ----------------- +*/ +static uint8_t bits_needed_for(uint32_t value) +{ +if (value == 0x01) + return 0; +else if (value <= 0x01) + return 1; +else if 
(value <= 0x03) + return 2; +else if (value <= 0x07) + return 3; +else if (value <= 0x0F) + return 4; +else if (value <= 0x1F) + return 5; +else if (value <= 0x3F) + return 6; +else if (value <= 0x7F) + return 7; +else if (value <= 0xFF) + return 8; +else if (value <= 0x1FF) + return 9; +else if (value <= 0x3FF) + return 10; +else if (value <= 0xFFF) + return 12; +else if (value <= 0xFFFF) + return 16; +else if (value <= 0x1FFFFF) + return 21; +else + return 32; +} + +/* + VBYTE_BYTES_NEEDED_FOR() + ------------------------ +*/ +static inline uint32_t vbyte_bytes_needed_for(uint32_t docno) +{ +if (docno < (1 << 7)) + return 1; +else if (docno < (1 << 14)) + return 2; +else if (docno < (1 << 21)) + return 3; +else if (docno < (1 << 28)) + return 4; +else + return 5; +} + +/* + VBYTE_COMPRESS_INTO() + --------------------- + NOTE: We compress "backwards" because we want to keep decompressing from the end of the string + to get the number +*/ +static inline void vbyte_compress_into(uint8_t *dest, uint32_t docno) +{ +if (docno < (1 << 7)) + dest[0] = (docno & 0x7F) | 0x80; +else if (docno < (1 << 14)) + { + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +else if (docno < (1 << 21)) + { + dest[2] = (docno >> 14) & 0x7F; + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +else if (docno < (1 << 28)) + { + dest[3] = (docno >> 21) & 0x7F; + dest[2] = (docno >> 14) & 0x7F; + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +else + { + dest[4] = (docno >> 28) & 0x7F; + dest[3] = (docno >> 21) & 0x7F; + dest[2] = (docno >> 14) & 0x7F; + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +} + +/* + VBYTE_DECOMPRESS() + ------------------ + NOTE: this method is given a ponter to the end of the v-byte compressed + integer. 
The task is to work backwards until it gets the integer +*/ +static inline uint32_t vbyte_decompress(uint8_t *source) +{ +uint32_t result; + +if (*source & 0x80) + return *source & 0x7F; +else + { + result = *source--; + + while (!(*source & 0x80)) + result = (result << 7) | *source--; + + return (result << 7) | (*source & 0x7F); + } +} + +/* + WRITE_OUT() + ----------- +*/ +static void write_out(uint8_t **buffer, uint32_t *source, uint32_t raw_count, uint32_t size_in_bits, uint8_t **length_buffer) +{ +uint32_t current, batch; +uint8_t *destination = *buffer; +uint32_t *end = source + raw_count; +uint8_t *key_store = *length_buffer; +uint32_t ALIGN_16 sequence_buffer[4]; +uint32_t instance, value; +uint8_t type; +uint32_t count; + +uint32_t max_bytes = 1; // this is the bytw-width for type128 encoded non-SSE integers + +#ifdef STATS + stats[size_in_bits] += raw_count; +#endif + +if (size_in_bits == 0) + { + type = 0; + count = (raw_count + 255) / 256; + } +else if (size_in_bits == 1) + { + type = 1; // 1 bit per integer + count = (raw_count + 127) / 128; + } +else if (size_in_bits == 2) + { + type = 2; // 2 bits per integer + count = (raw_count + 63) / 64; + } +else if (size_in_bits == 3) + { + type = 3; // 3 bits per integer + count = (raw_count + 39) / 40; + } +else if (size_in_bits == 4) + { + type = 4; // 4 bits per integer + count = (raw_count + 31) / 32; + } +else if (size_in_bits == 5) + { + type = 5; // 5 bits per integer + count = (raw_count + 23) / 24; + } +else if (size_in_bits == 6) + { + type = 6; // 6 bits per integer + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 7) + { + type = 7; // 7 bits per integer, 18 integers per read (but requires 2 reads) + count = (raw_count + 35) / 36; + } +else if (size_in_bits == 8) + { + type = 8; // 8 bits per integer + count = (raw_count + 15) / 16; + } +else if (size_in_bits == 9) + { + type = 9; // 9 bits per integer, 14 integers per read (but requires 2 reads) + count = (raw_count + 27) / 28; + } 
+else if (size_in_bits == 10) + { + type = 10; // 10 bits per integer + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 12) + { + type = 11; // 12 bits per integer, 10 integers per read (but requires 2 reads) + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 16) + { + type = 12; // 16 bits per integer + count = (raw_count + 7) / 8; + } +else if (size_in_bits == 21) + { + type = 13; // 21 bits per integer, 6 integers per read (but requires 2 reads) + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 32) + { + type = 14; // 32 bits per integer + count = (raw_count + 3) / 4; + } +else if (size_in_bits == 128) + { + type = 15; + count = raw_count; + /* + As the count for type 128 can only be 1, 2, or 3, we can re-appropriate it and store the bit-length in there too. + */ + max_bytes = 1; + for (uint32_t integer = 0; integer < count; integer++) + { + if (bytes_needed_for(source[integer]) > max_bytes) + max_bytes = bytes_needed_for(source[integer]); + } + } +else + exit(printf("Can't compress into integers of size %dbits\n", size_in_bits)); + +while (count > 0) + { + batch = count > 16 ? 16 : count; + *key_store++ = (type << 4) | (~(batch - 1) & 0x0F); + + count -= batch; + + for (current = 0; current < batch; current++) + { + switch (size_in_bits) + { + case 0: // 0 bits per integer (i.e. 
a long sequence of zeros) + /* + In this case we don't need to store a 4 byte integer because its implicit + */ + source += 256; + break; + case 1: // 1 bit per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 128; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 1); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 128; + break; + case 2: // 2 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 64; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 2); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 64; + break; + case 3: // 3 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 40; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 3); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 40; + break; + case 4: // 4 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 32; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 4); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 32; + break; + case 5: // 5 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 24; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 5); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 24; + break; + case 6: // 6 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 6); + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 20; + break; + case 7: // 7 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + 
sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 7); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 16; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] >> 4; + for (value = 20; value < 36; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 20) / 4) * 7 + 3); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 36; // 36 in a double 128-bit word + break; + case 8: // 8 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 16 && source < end; instance++) +#else + for (instance = 0; instance < 16; instance++) +#endif + *destination++ = (uint8_t)*source++; + break; + case 9: // 9 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 9); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 12; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] >> 5; + for (value = 16; value < 28; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 16) / 4) * 9 + 4); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 28; // 28 in a double 128-bit word + break; + case 10: // 10 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 10); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 12; + break; + case 12: // 12 bit integers + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 12); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + 
memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] >> 8; + for (value = 12; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 12) / 4) * 12 + 8); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 20; // 20 in a double 128-bit word + break; + case 16: // 16 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 8 && source < end; instance++) +#else + for (instance = 0; instance < 8; instance++) +#endif + { + *(uint16_t *)destination = (uint16_t)*source++; + destination += 2; + } + break; + case 21: // 21 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 21); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 4; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] >> 11; + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 8) / 4) * 21 + 11); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 12; // 12 in a double 128-bit word + break; + case 32: // 32 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 4 && source < end; instance++) +#else + for (instance = 0; instance < 4; instance++) +#endif + { + *(uint32_t *)destination = (uint32_t)*source++; + destination += 4; + } + break; + case 128: + if (max_bytes == 1) + { + *(uint8_t *)destination = (uint8_t)*source; + source++; + destination += 1; + *(key_store - 1) = (type << 4) | (~(batch - 1) & 0x03); + } + else if (max_bytes == 2) + { + *(uint16_t *)destination = (uint16_t)*source; + source++; + destination += 2; + *(key_store - 1) = (type << 4) | 4 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 3) + { + *destination++ = 
(uint8_t)((*source >> 16) & 0xFF); + *destination++ = (uint8_t)((*source >> 8) & 0xFF); + *destination++ = (uint8_t)((*source >> 0) & 0xFF); + source++; + + *(key_store - 1) = (type << 4) | 8 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 4) + { + *(uint32_t *)destination = (uint32_t)*source; + source++; + destination += 4; + *(key_store - 1) = (type << 4) | 0x0C | (~(batch - 1) & 0x03); + } + else + printf("max_bytes must be 1, 2, 3, or 4, but is:%d", (int)max_bytes); + break; + } + } + } +*buffer = destination; +*length_buffer = key_store; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b) +{ +return a > b ? a : b; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b, T c, T d) +{ +return max(max(a, b), max(c, d)); +} + +/* + ANT_COMPRESS_QMX::ENCODEARRAY() + ------------------------------- +*/ +void ANT_compress_qmx::encodeArray(const uint32_t *source, uint64_t source_integers, uint32_t *into, uint64_t *nvalue) +{ +const uint32_t WASTAGE = 512; +uint8_t *current_length, *destination = (uint8_t *)into, *keys; +uint32_t *current, run_length, bits, new_needed, wastage; +uint32_t block, largest; + +/* + make sure we have enough room to store the lengths +*/ +if (length_buffer_length < source_integers) + { + delete [] length_buffer; + length_buffer = new uint8_t [(size_t)(length_buffer_length = source_integers) + WASTAGE]; + } + +/* + Get the lengths of the integers +*/ +current_length = length_buffer; +for (current = (uint32_t *)source; current < source + source_integers; current++) + *current_length++ = bits_needed_for(*current); + +/* + Shove a bunch of 0 length integers on the end to allow for overflow +*/ +for (wastage = 0; wastage < WASTAGE; wastage++) + *current_length++ = 0; + +/* + Process the lengths. To maximise SSE throughput we need each write to be 128-bit (4*32-bit) alignned + and therefore we need each compress "block" to be the same size where a compress "block" is a set of + four encoded integers starting on a 4-integer boundary. 
+*/ +for (current_length = length_buffer; current_length < length_buffer + source_integers + 4; current_length += 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = max(*current_length, *(current_length + 1), *(current_length + 2), *(current_length + 3)); + +/* + This code makes sure we can do aligned reads, promoting to larger integers if necessary +*/ +current_length = length_buffer; +while (current_length < length_buffer + source_integers) + { +#ifdef SHORT_END_BLOCKS + /* + If there are fewer than 16 values remaining and they all fit into 8-bits then its smaller than storing stripes + If there are fewer than 8 values remaining and they all fit into 16-bits then its smaller than storing stripes + If there are fewer than 4 values remaining and they all fit into 32-bits then its smaller than storing stripes + */ + if (source_integers - (current_length - length_buffer) < 4) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 16) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + else if (largest <= 32) + for (block = 0; block < 8; block++) + *(current_length + block) = 32; + } + else if (source_integers - (current_length - length_buffer) < 8) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + } + else if (source_integers - (current_length - length_buffer) < 16) + { + largest = 0; + for (block = 0; block < 16; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 16; block++) + *(current_length + block) = 8; 
+ } + /* + Otherwise we have the standard rules for a block + */ +#endif + /* + Two things need to happen to be able to use a particular selector. The first is that all the + values that would end up in that block need to use at most the bit value of that block. + The second is that there need to be at least as many numbers remaining as the block encodes. + + For example, if the current block only needs 0-bits per int, then check that the 256 values + that would be encoded only take 0-bits. If any value needs more, or there aren't 256 numbers remaining, + then promote the current block to try encode 128 1-bit values. + */ + switch (*current_length) + { + case 0: + if (source_integers - (current_length - length_buffer) < 256) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + break; + } + for (block = 0; block < 256; block += 4) + if (*(current_length + block) > 0) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + if (*current_length == 0) + { + for (block = 0; block < 256; block++) + current_length[block] = 0; + current_length += 256; + } + break; + case 1: + if (source_integers - (current_length - length_buffer) < 128) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + break; + } + for (block = 0; block < 128; block += 4) + if (*(current_length + block) > 1) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + if (*current_length == 1) + { + for (block = 0; block < 128; block++) + current_length[block] = 1; + current_length += 128; + } + break; + case 2: + if (source_integers - (current_length - length_buffer) < 64) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + break; + } + for (block = 0; block < 64; block += 4) + if (*(current_length + block) > 2) + *current_length = 
*(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + if (*current_length == 2) + { + for (block = 0; block < 64; block++) + current_length[block] = 2; + current_length += 64; + } + break; + case 3: + if (source_integers - (current_length - length_buffer) < 40) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + break; + } + for (block = 0; block < 40; block += 4) + if (*(current_length + block) > 3) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + if (*current_length == 3) + { + for (block = 0; block < 40; block++) + current_length[block] = 3; + current_length += 40; + } + break; + case 4: + if (source_integers - (current_length - length_buffer) < 32) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + break; + } + for (block = 0; block < 32; block += 4) + if (*(current_length + block) > 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + if (*current_length == 4) + { + for (block = 0; block < 32; block++) + current_length[block] = 4; + current_length += 32; + } + break; + case 5: + if (source_integers - (current_length - length_buffer) < 24) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + break; + } + for (block = 0; block < 24; block += 4) + if (*(current_length + block) > 5) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + if (*current_length == 5) + { + for (block = 0; block < 24; block++) + current_length[block] = 5; + current_length += 24; + } + break; + case 6: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + break; + } + for (block = 0; block < 
20; block += 4) + if (*(current_length + block) > 6) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + if (*current_length == 6) + { + for (block = 0; block < 20; block++) + current_length[block] = 6; + current_length += 20; + } + break; + case 7: + if (source_integers - (current_length - length_buffer) < 36) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + break; + } + for (block = 0; block < 36; block += 4) // 36 in a double 128-bit word + if (*(current_length + block) > 7) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + if (*current_length == 7) + { + for (block = 0; block < 36; block++) + current_length[block] = 7; + current_length += 36; + } + break; + case 8: + if (source_integers - (current_length - length_buffer) < 16) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + break; + } + for (block = 0; block < 16; block += 4) + if (*(current_length + block) > 8) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + if (*current_length == 8) + { + for (block = 0; block < 16; block++) + current_length[block] = 8; + current_length += 16; + } + break; + case 9: + if (source_integers - (current_length - length_buffer) < 28) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote + break; + } + for (block = 0; block < 28; block += 4) // 28 in a double 128-bit word + if (*(current_length + block) > 9) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote + if (*current_length == 9) + { + for (block = 0; block < 28; block++) + current_length[block] = 9; + current_length += 28; + } + break; + case 10: + if (source_integers - (current_length - length_buffer) < 12) + { + 
*current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + break; + } + for (block = 0; block < 12; block += 4) + if (*(current_length + block) > 10) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + if (*current_length == 10) + { + for (block = 0; block < 12; block++) + current_length[block] = 10; + current_length += 12; + } + break; + case 12: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + break; + } + for (block = 0; block < 20; block += 4) // 20 in a double 128-bit word + if (*(current_length + block) > 12) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + if (*current_length == 12) + { + for (block = 0; block < 20; block++) + current_length[block] = 12; + current_length += 20; + } + break; + case 16: + if (source_integers - (current_length - length_buffer) < 8) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + break; + } + for (block = 0; block < 8; block += 4) + if (*(current_length + block) > 16) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + if (*current_length == 16) + { + for (block = 0; block < 8; block++) + current_length[block] = 16; + current_length += 8; + } + break; + case 21: + if (source_integers - (current_length - length_buffer) < 12) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + break; + } + for (block = 0; block < 12; block += 4) // 12 in a double 128-bit word + if (*(current_length + block) > 21) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + if (*current_length == 21) + { + for (block = 0; block < 12; 
block++) + current_length[block] = 21; + current_length += 12; + } + break; + case 32: + if (source_integers - (current_length - length_buffer) < 4) + { + for (block = 0; block < (source_integers - (current_length - length_buffer)); block++) + *(current_length + block) = 128; // promote + break; + } + for (block = 0; block < 4; block += 4) + if (*(current_length + block) > 32) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 64; // promote + if (*current_length == 32) + { + for (block = 0; block < 4; block++) + current_length[block] = 32; + current_length += 4; + } + break; + case 128: + /* + The 128-bit selector is used as a last resort when there are not enough numbers to use an + earlier selector. So don't worry about checking the rest. + */ + current_length += source_integers - (current_length - length_buffer); + break; + default: + exit(printf("Selecting on a non whole power of 2, must exit\n")); + break; + } + } + +/* + We can now compress based on the lengths in length_buffer +*/ +run_length = 1; +bits = length_buffer[0]; +keys = length_buffer; // we're going to re-use the length_buffer because it can't overlap and this saves a double malloc +for (current = (uint32_t *)source + 1; current < source + source_integers; current++) + { + new_needed = length_buffer[current - source]; + if (new_needed == bits) + run_length++; + else + { + write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + bits = new_needed; + run_length = 1; + } + } +write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + +/* + Copy the lengths to the end +*/ +memcpy(destination, length_buffer, keys - length_buffer); +destination += keys - length_buffer; + +/* + Add the pointer to the lengths +*/ +uint32_t val = keys - length_buffer + vbyte_bytes_needed_for(keys - length_buffer); // offset (from the end) to the start of the keys +if (vbyte_bytes_needed_for(val) > vbyte_bytes_needed_for(keys - 
length_buffer)) + val = keys - length_buffer + vbyte_bytes_needed_for(val); // although rare, this happens when adding the length of the vbyte encoded length makes the vbyte encoding one byte longer (i.e. 127) +vbyte_compress_into(destination, val); + +destination += vbyte_bytes_needed_for(val); + + +/* + Compute the length (in bytes) +*/ +*nvalue = destination - (uint8_t *)into; // return length in bytes +} + +#ifdef MAKE_DECOMPRESS + /* + The following program generates the source code for ANT_compress_qmx::decodeArray() + */ + /* + MAIN() + ------ + This version assumes SSE4.1 and so it is *not* portable to non X86 architectures + */ + int main(void) + { + uint32_t instance; + + + printf("static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};\n"); + printf("static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};\n"); + printf("static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};\n"); + printf("static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f};\n"); + printf("static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};\n"); + printf("static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};\n"); + printf("static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};\n"); + printf("static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};\n"); + printf("static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01};\n"); + printf("void ANT_compress_qmx::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)\n"); + printf("{\n"); + printf("__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;\n"); + printf("uint8_t *in = (uint8_t *)source;\n"); + 
printf("uint32_t *end = to + destination_integers;\n"); + printf("uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1);\n"); + printf("uint8_t *keys = (uint8_t *)source + len - key_start;\n"); + + printf("\n"); + printf("mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);\n"); + printf("mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);\n"); + printf("mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);\n"); + printf("mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);\n"); + printf("mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);\n"); + printf("mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);\n"); + printf("mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);\n"); + printf("mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);\n"); + printf("mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);\n"); + printf("mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);\n"); + printf("mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("\n"); + + printf("while (to < end)\n"); + printf("\t{\n"); + printf("\tswitch (*keys++)\n"); + printf("\t\t{\n"); + + for (instance = 0; instance <= 0xFF; instance++) + { + printf("\t\tcase 0x%02x:\n", instance); + if ((instance >> 4) == 0) + { + /* + 256 0-bit integers + */ + printf("#ifdef NO_ZEROS\n"); + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("#else\n"); + printf("\t\t\ttmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));\n"); + printf("#endif\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, 
tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 32, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 33, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 34, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 35, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 36, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 37, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 38, tmp);\n"); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 39, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 40, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 41, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 42, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 43, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 44, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 45, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 46, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 47, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 48, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 49, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 50, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 51, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 52, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 53, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 54, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 55, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 56, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 57, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 58, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 59, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 60, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 61, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 62, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 63, tmp);\n"); + printf("\t\t\tto += 256;\n"); // becomes 256 integers + } + else if (instance >> 4 == 1) + { + /* + 128 * 1-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 
1, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));\n"); + 
printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 
1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 128;\n"); // becomes 128 integers + } + else if (instance >> 4 == 2) + { + /* + 64 * 2-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i 
*)to + 5, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 64;\n"); // becomes 64 integers + } + else if (instance >> 4 == 3) + { + /* + 40 * 3-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));\n"); + 
printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 40;\n"); // becomes 40 integers + } + else if (instance >> 4 == 4) + { + /* + 32 * 4-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 
4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 32;\n"); // becomes 32 integers + } + else if (instance >> 4 == 5) + { + /* + 24 * 5-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_5));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 24;\n"); // becomes 24 integers + } + else if (instance >> 4 == 6) + { + /* + 20 * 6-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 20;\n"); // becomes 20 integers + } + else if (instance >> 4 == 7) + { + /* + 36 * 7 bit integers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));\n"); + + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 36;\n"); // becomes 36 integers + } + else if (instance >> 4 == 8) + { + /* + 16 * 8-bit integers + */ + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));\n"); + printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));\n"); + printf("\t\t\ttmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));\n"); + printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 16;\n"); // becomes 16 integers + } + else if (instance >> 4 == 9) + { + /* + 28 * 9-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));\n"); + 
printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 28;\n"); // becomes 28 integers + } + else if (instance >> 4 == 10) + { + /* + 12 * 10-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 12;\n"); // becomes 12 integers + } + else if (instance >> 4 == 11) + { + /* + 20 * 12-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i 
*)to, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 8);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 20;\n"); // becomes 20 integers + } + else if (instance >> 4 == 12) + { + /* + 16-bit integers + */ + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 8;\n"); // becomes 8 integers + } + else if (instance >> 4 == 13) + { + /* + 12 * 21-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));\n"); + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + 
printf("\t\t\tto += 12;\n"); // becomes 8 integers + } + else if (instance >> 4 == 14) + { + /* + 32-bit integers + */ + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 4;\n"); // becomes 4 integers + } + else if (instance >> 4 == 15) + { + /* + 128-bit integers + if there are fewer than 4 integes then we just bit-pack them in to 8, 16, 24, or 32-bit words + */ + if ((instance & 0x0C) == 0x00) + { + printf("\t\t\t*to = *(uint8_t *)in;\n"); + printf("\t\t\tin += 1;\n"); // 1 byte integer + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x04) + { + printf("\t\t\t*to = *(uint16_t *)in;\n"); + printf("\t\t\tin += 2;\n"); // 2 byte integers + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x08) + { + printf("\t\t\t*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));\n"); + printf("\t\t\tin += 3;\n"); // 3 byte integer + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x0C) + { + printf("\t\t\t*to = *(uint32_t *)in;\n"); + printf("\t\t\tin += 4;\n"); // 4 byte integer + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + if (instance == 0xFF || instance == 0xFB || instance == 0xF7 || instance == 0xF3) + printf("\t\t\tbreak;\n"); + } + else + { + printf("\t\t\tin++;\n"); // dummy, can't occur + } + if ((instance & 0xF) == 0xF) + printf("\t\t\tbreak;\n"); // every 32 instances we break (its the end of the fall through) + } + printf("\t\t}\n"); + printf("\t}\n"); + printf("}\n"); + } +#endif + +#ifdef TEST_ONE_STRING + static uint32_t sequence[]={0x80, 0x80FF, 0x80FFFF}; + static uint32_t 
sequence_unused[]={13,1,1,26,18,3,1,9,4,8,5,19,7,26,1,5,7,3,12,5,39,16,3,5,19,8,18,1,1,1,2,5,9,3,21,2,6,37,3,5,5,18,3,31,3,22,5,17,6,12,6,2,5,10,3,12,51,14,7,8,1,2,3,27,19,1,10,8,2,7,2,9,16,6,6,5,6,4,18,21,13,2,1,11,3,22,2,16,13,61,21,12,51,10,6,31,14,65,15,82,5,4,18,3,1,1,4,34,5,9,4,7,1,25,17,52,60,8,8,4,22,7,49,26,2,72,29,33,6,11,3,8,1,23,37,1,3,1,1,1,3,20,6,1,2,1,1,1,14,2,4,1,6,4,4,3,1,1,2,2,1,9,29,1,10,11,4,10,31}; + + static uint32_t second_compress_buffer[100000]; + static uint32_t second_decompress_buffer[100000]; + + uint32_t second_compress_buffer_size = sizeof(second_compress_buffer) / sizeof(*second_compress_buffer); + uint32_t second_decompress_buffer_size = sizeof(second_decompress_buffer) / sizeof(*second_decompress_buffer); + + /* + CHECK() + ------- + */ + void check(uint32_t *sequence, uint32_t sequence_length) + { + ANT_compress_qmx compressor; + uint64_t buffer_size; + uint32_t pos; + uint32_t fail; + + memset(second_compress_buffer, 0, second_compress_buffer_size); + memset(second_decompress_buffer, 0, second_decompress_buffer_size); + + compressor.encodeArray(sequence, sequence_length, (uint32_t *)second_compress_buffer, &buffer_size); + printf("%u integers became %u bytes\n", sequence_length, buffer_size); + + second_compress_buffer[buffer_size] = 0; + second_compress_buffer[buffer_size + 1] = 0; + second_compress_buffer[buffer_size + 2] = 0; + second_compress_buffer[buffer_size + 3] = 0; + + for (pos = 0; pos < buffer_size; pos++) + printf("%02X ", ((uint8_t *)second_compress_buffer)[pos]); + puts(""); + + compressor.decodeArray((uint32_t *)second_compress_buffer, buffer_size, (uint32_t *)second_decompress_buffer, sequence_length); + + fail = false; + for (pos = 0; pos < sequence_length; pos++) + if (sequence[pos] != second_decompress_buffer[pos]) + { + printf("p[%d]:%X != %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + fail = true; + } + else + printf("p[%d]:%X == %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + + if 
(fail) + puts("Test failed"); + else + puts("Test succeeded"); + } + + /* + MAIN() + ------ + */ + int main(void) + { + check(sequence, sizeof(sequence) / sizeof(*sequence)); + } +#endif +/* + ANT_COMPRESS_QMX::DECODEARRAY() + -------------------------------- + this code was generated by the method above. +*/ +#include "compress_qmx_decompress.cpp" diff --git a/ext/bench_/bench/compress_qmx.h b/ext/bench_/bench/compress_qmx.h new file mode 100644 index 0000000..83c1ea6 --- /dev/null +++ b/ext/bench_/bench/compress_qmx.h @@ -0,0 +1,43 @@ +/* + COMPRESS_QMX.H + -------------- + Original QMX with overflow removal added by Matt Crane +*/ +#ifndef COMPRESS_QMX_H_ +#define COMPRESS_QMX_H_ + +#include +#include "compress.h" + +/* + class ANT_COMPRESS_QMX + ---------------------- +*/ +class ANT_compress_qmx : public ANT_compress +{ +private: + uint8_t *length_buffer; + uint64_t length_buffer_length; + +public: + ANT_compress_qmx(); + virtual ~ANT_compress_qmx(); + + void encodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t *nvalue); + void decodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t nvalue); + + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) + { + uint64_t answer; + encodeArray(source, source_integers, (uint32_t *)destination, &answer); + return answer; + } + + virtual void decompress(uint32_t *destination, uint64_t destinaton_integers, uint8_t *source, uint64_t source_length) + { + decodeArray((uint32_t *)source, source_length, destination, destinaton_integers); + } +} ; + +#endif + diff --git a/ext/bench_/bench/compress_qmx_adcs.cpp b/ext/bench_/bench/compress_qmx_adcs.cpp new file mode 100644 index 0000000..74b8e85 --- /dev/null +++ b/ext/bench_/bench/compress_qmx_adcs.cpp @@ -0,0 +1,6700 @@ +/* + COMPRESS_QMX_ADCS.CPP + -------------- + Copyright (c) 2014 by Andrew Trotman + Licensed BSD + + A version of BinPacking where we pack into a 128-bit SSE 
register the following: + 256 0-bit words + 128 1-bit words + 64 2-bit words + 40 3-bit words + 32 4-bit words + 24 5-bit words + 20 6-bit words + 16 8-bit words + 12 10-bit words + 8 16-bit words + 4 32-bit words + or pack into two 128-bit words (i.e. 256 bits) the following: + 36 7-bit words + 28 9-bit words + 20 12-bit words + 12 21-bit words + + This gives us 15 possible combinations. The combination is stored in the top 4 bits of a selector byte. The + bottom 4-bits of the selector store a run-length (the number of such sequences seen in a row). + + The 128-bit (or 256-bit) packed binary values are stored first. Then we store the selectors. Finally, + stored variable byte encoded, is a pointer to the start of the selector (from the end of the sequence). + + This way, all reads and writes are 128-bit word aligned, except addressing the selector (and the pointer + to the selector). These reads are byte aligned. + + Note: There is currently 1 unused encoding (i.e. 16 unused selector values). These might in the future be + used for encoding exceptions, much as PForDelta does. +*/ +#include +#include +#include +#include +#include +#include "compress_qmx_adcs.h" + +//#define MAKE_DECOMPRESS 1 /* uncomment this and it will create a program that writes the decompressor */ +//#define TEST_ONE_STRING 1 /* Uncomment this and it will create a program that can be used to test the compressor and decompressor */ +#define NO_ZEROS 1 /* stores runs of 256 1s in a row (not 1-bit number, but actual 1 values). 
*/ +#define SHORT_END_BLOCKS 1 + +#ifdef _MSC_VER + #define ALIGN_16 __declspec(align(16)) +#else + #define ALIGN_16 __attribute__ ((aligned (16))) +#endif + +//#define STATS /* uncomment this and it will count the selector usage */ +#ifdef STATS + static uint32_t stats[65] = {0}; +#endif + +/* + COMPRESS_QMX_ADCS::COMPRESS_QMX_ADCS() + ---------------------------- +*/ +ANT_compress_qmx_adcs::ANT_compress_qmx_adcs() +{ +length_buffer = NULL; +length_buffer_length = 0; +} + +/* + COMPRESS_QMX_ADCS::!COMPRESS_QMX_ADCS() + ----------------------------- +*/ +ANT_compress_qmx_adcs::~ANT_compress_qmx_adcs() +{ +delete [] length_buffer; +#ifdef STATS + uint32_t which; + for (which = 0; which <= 32; which++) + if (stats[which] != 0) + printf("%d\t%d\ttimes\n", which, stats[which]); +#endif +} + +/* + BITS_NEEDED_FOR() + ----------------- +*/ +static uint8_t bits_needed_for(uint32_t value) +{ +if (value == 0x01) + return 0; +else if (value <= 0x01) + return 1; +else if (value <= 0x03) + return 2; +else if (value <= 0x07) + return 3; +else if (value <= 0x0F) + return 4; +else if (value <= 0x1F) + return 5; +else if (value <= 0x3F) + return 6; +else if (value <= 0x7F) + return 7; +else if (value <= 0xFF) + return 8; +else if (value <= 0x1FF) + return 9; +else if (value <= 0x3FF) + return 10; +else if (value <= 0xFFF) + return 12; +else if (value <= 0xFFFF) + return 16; +else if (value <= 0x1FFFFF) + return 21; +else + return 32; +} + +/* + VBYTE_BYTES_NEEDED_FOR() + ------------------------ +*/ +static inline uint32_t vbyte_bytes_needed_for(uint32_t docno) +{ +if (docno < (1 << 7)) + return 1; +else if (docno < (1 << 14)) + return 2; +else if (docno < (1 << 21)) + return 3; +else if (docno < (1 << 28)) + return 4; +else + return 5; +} + +/* + VBYTE_COMPRESS_INTO() + --------------------- + NOTE: We compress "backwards" because we want to keep decompressing from the end of the string + to get the number +*/ +static inline void vbyte_compress_into(uint8_t *dest, uint32_t docno) 
+{ +if (docno < (1 << 7)) + dest[0] = (docno & 0x7F) | 0x80; +else if (docno < (1 << 14)) + { + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +else if (docno < (1 << 21)) + { + dest[2] = (docno >> 14) & 0x7F; + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +else if (docno < (1 << 28)) + { + dest[3] = (docno >> 21) & 0x7F; + dest[2] = (docno >> 14) & 0x7F; + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +else + { + dest[4] = (docno >> 28) & 0x7F; + dest[3] = (docno >> 21) & 0x7F; + dest[2] = (docno >> 14) & 0x7F; + dest[1] = (docno >> 7) & 0x7F; + dest[0] = (docno & 0x7F) | 0x80; + } +} + +/* + VBYTE_DECOMPRESS() + ------------------ + NOTE: this method is given a ponter to the end of the v-byte compressed + integer. The task is to work backwards until it gets the integer +*/ +static inline uint32_t vbyte_decompress(uint8_t *source) +{ +uint32_t result; + +if (*source & 0x80) + return *source & 0x7F; +else + { + result = *source--; + + while (!(*source & 0x80)) + result = (result << 7) | *source--; + + return (result << 7) | (*source & 0x7F); + } +} + +/* + WRITE_OUT() + ----------- +*/ +static void write_out(uint8_t **buffer, uint32_t *source, uint32_t raw_count, uint32_t size_in_bits, uint8_t **length_buffer) +{ +uint32_t current, batch; +uint8_t *destination = *buffer; +uint32_t *end = source + raw_count; +uint8_t *key_store = *length_buffer; +uint32_t ALIGN_16 sequence_buffer[4]; +uint32_t instance, value; +uint8_t type; +uint32_t count; + +#ifdef STATS + stats[size_in_bits] += raw_count; +#endif + +if (size_in_bits == 0) + { + type = 0; + count = (raw_count + 255) / 256; + } +else if (size_in_bits == 1) + { + type = 1; // 1 bit per integer + count = (raw_count + 127) / 128; + } +else if (size_in_bits == 2) + { + type = 2; // 2 bits per integer + count = (raw_count + 63) / 64; + } +else if (size_in_bits == 3) + { + type = 3; // 3 bits per integer + count = (raw_count + 39) / 40; + } +else if 
(size_in_bits == 4) + { + type = 4; // 4 bits per integer + count = (raw_count + 31) / 32; + } +else if (size_in_bits == 5) + { + type = 5; // 5 bits per integer + count = (raw_count + 23) / 24; + } +else if (size_in_bits == 6) + { + type = 6; // 6 bits per integer + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 7) + { + type = 7; // 7 bits per integer, 18 integers per read (but requires 2 reads) + count = (raw_count + 35) / 36; + } +else if (size_in_bits == 8) + { + type = 8; // 8 bits per integer + count = (raw_count + 15) / 16; + } +else if (size_in_bits == 9) + { + type = 9; // 9 bits per integer, 14 integers per read (but requires 2 reads) + count = (raw_count + 27) / 28; + } +else if (size_in_bits == 10) + { + type = 10; // 10 bits per integer + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 12) + { + type = 11; // 12 bits per integer, 10 integers per read (but requires 2 reads) + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 16) + { + type = 12; // 16 bits per integer + count = (raw_count + 7) / 8; + } +else if (size_in_bits == 21) + { + type = 13; // 21 bits per integer, 6 integers per read (but requires 2 reads) + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 32) + { + type = 14; // 32 bits per integer + count = (raw_count + 3) / 4; + } +else + exit(printf("Can't compress into integers of size %dbits\n", size_in_bits)); + +while (count > 0) + { + batch = count > 16 ? 16 : count; + *key_store++ = (type << 4) | (~(batch - 1) & 0x0F); + + count -= batch; + + for (current = 0; current < batch; current++) + { + switch (size_in_bits) + { + case 0: // 0 bits per integer (i.e. 
a long sequence of zeros) + /* + In this case we don't need to store a 4 byte integer because its implicit + */ + source += 256; + break; + case 1: // 1 bit per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 128; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 1); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 128; + break; + case 2: // 2 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 64; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 2); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 64; + break; + case 3: // 3 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 40; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 3); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 40; + break; + case 4: // 4 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 32; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 4); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 32; + break; + case 5: // 5 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 24; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 5); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 24; + break; + case 6: // 6 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 6); + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 20; + break; + case 7: // 7 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + 
sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 7); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 16; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] >> 4; + for (value = 20; value < 36; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 20) / 4) * 7 + 3); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 36; // 36 in a double 128-bit word + break; + case 8: // 8 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 16 && source < end; instance++) +#else + for (instance = 0; instance < 16; instance++) +#endif + *destination++ = (uint8_t)*source++; + break; + case 9: // 9 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 9); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 12; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] >> 5; + for (value = 16; value < 28; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 16) / 4) * 9 + 4); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 28; // 28 in a double 128-bit word + break; + case 10: // 10 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 10); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 12; + break; + case 12: // 12 bit integers + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 12); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + 
memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] >> 8; + for (value = 12; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 12) / 4) * 12 + 8); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 20; // 20 in a double 128-bit word + break; + case 16: // 16 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 8 && source < end; instance++) +#else + for (instance = 0; instance < 8; instance++) +#endif + { + *(uint16_t *)destination = (uint16_t)*source++; + destination += 2; + } + break; + case 21: // 21 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 21); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 4; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] >> 11; + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 8) / 4) * 21 + 11); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 12; // 12 in a double 128-bit word + break; + case 32: // 32 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 4 && source < end; instance++) +#else + for (instance = 0; instance < 4; instance++) +#endif + { + *(uint32_t *)destination = (uint32_t)*source++; + destination += 4; + } + break; + } + } + } +*buffer = destination; +*length_buffer = key_store; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b) +{ +return a > b ? 
a : b; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b, T c, T d) +{ +return max(max(a, b), max(c, d)); +} + +/* + COMPRESS_QMX_ADCS::ENCODEARRAY() + --------------------------- +*/ +void ANT_compress_qmx_adcs::encodeArray(const uint32_t *source, uint64_t source_integers, uint32_t *into, uint64_t *nvalue) +{ +const uint32_t WASTAGE = 512; +uint8_t *current_length, *destination = (uint8_t *)into, *keys; +uint32_t *current, run_length, bits, new_needed, wastage; +uint32_t block, largest; + +/* + make sure we have enough room to store the lengths +*/ +if (length_buffer_length < source_integers) + { + delete [] length_buffer; + length_buffer = new uint8_t [(size_t)((length_buffer_length = source_integers) + WASTAGE)]; + } + +/* + Get the lengths of the integers +*/ +current_length = length_buffer; +for (current = (uint32_t *)source; current < source + source_integers; current++) + *current_length++ = bits_needed_for(*current); + +/* + Shove a bunch of 0 length integers on the end to allow for overflow +*/ +for (wastage = 0; wastage < WASTAGE; wastage++) + *current_length++ = 0; + +/* + Process the lengths. To maximise SSE throughput we need each write to be 128-bit (4*32-bit) alignned + and therefore we need each compress "block" to be the same size where a compress "block" is a set of + four encoded integers starting on a 4-integer boundary. 
+*/ +for (current_length = length_buffer; current_length < length_buffer + source_integers + 4; current_length += 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = max(*current_length, *(current_length + 1), *(current_length + 2), *(current_length + 3)); + +/* + This code makes sure we can do aligned reads, promoting to larger integers if necessary +*/ +current_length = length_buffer; +while (current_length < length_buffer + source_integers) + { +#ifdef SHORT_END_BLOCKS + /* + If there are fewer than 16 values remaining and they all fit into 8-bits then its smaller than storing stripes + If there are fewer than 8 values remaining and they all fit into 16-bits then its smaller than storing stripes + If there are fewer than 4 values remaining and they all fit into 32-bits then its smaller than storing stripes + */ + if (source_integers - (current_length - length_buffer) < 4) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 16) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + else if (largest <= 32) + for (block = 0; block < 8; block++) + *(current_length + block) = 32; + } + else if (source_integers - (current_length - length_buffer) < 8) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + } + else if (source_integers - (current_length - length_buffer) < 16) + { + largest = 0; + for (block = 0; block < 16; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 16; block++) + *(current_length + block) = 8; 
+ } + /* + Otherwise we have the standard rules for a block + */ +#endif + switch (*current_length) + { + case 0: + for (block = 0; block < 256; block += 4) + if (*(current_length + block) > 0) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + if (*current_length == 0) + { + for (block = 0; block < 256; block++) + current_length[block] = 0; + current_length += 256; + } + break; + case 1: + for (block = 0; block < 128; block += 4) + if (*(current_length + block) > 1) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + if (*current_length == 1) + { + for (block = 0; block < 128; block++) + current_length[block] = 1; + current_length += 128; + } + break; + case 2: + for (block = 0; block < 64; block += 4) + if (*(current_length + block) > 2) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + if (*current_length == 2) + { + for (block = 0; block < 64; block++) + current_length[block] = 2; + current_length += 64; + } + break; + case 3: + for (block = 0; block < 40; block += 4) + if (*(current_length + block) > 3) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + if (*current_length == 3) + { + for (block = 0; block < 40; block++) + current_length[block] = 3; + current_length += 40; + } + break; + case 4: + for (block = 0; block < 32; block += 4) + if (*(current_length + block) > 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + if (*current_length == 4) + { + for (block = 0; block < 32; block++) + current_length[block] = 4; + current_length += 32; + } + break; + case 5: + for (block = 0; block < 24; block += 4) + if (*(current_length + block) > 5) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + if (*current_length == 5) + { + 
for (block = 0; block < 24; block++) + current_length[block] = 5; + current_length += 24; + } + break; + case 6: + for (block = 0; block < 20; block += 4) + if (*(current_length + block) > 6) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + if (*current_length == 6) + { + for (block = 0; block < 20; block++) + current_length[block] = 6; + current_length += 20; + } + break; + case 7: + for (block = 0; block < 36; block += 4) // 36 in a double 128-bit word + if (*(current_length + block) > 7) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + if (*current_length == 7) + { + for (block = 0; block < 36; block++) + current_length[block] = 7; + current_length += 36; + } + break; + case 8: + for (block = 0; block < 16; block += 4) + if (*(current_length + block) > 8) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + if (*current_length == 8) + { + for (block = 0; block < 16; block++) + current_length[block] = 8; + current_length += 16; + } + break; + case 9: + for (block = 0; block < 28; block += 4) // 28 in a double 128-bit word + if (*(current_length + block) > 9) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote + if (*current_length == 9) + { + for (block = 0; block < 28; block++) + current_length[block] = 9; + current_length += 28; + } + break; + case 10: + for (block = 0; block < 12; block += 4) + if (*(current_length + block) > 10) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + if (*current_length == 10) + { + for (block = 0; block < 12; block++) + current_length[block] = 10; + current_length += 12; + } + break; + case 12: + for (block = 0; block < 20; block += 4) // 20 in a double 128-bit word + if (*(current_length + block) > 12) + *current_length = *(current_length + 1) = 
*(current_length + 2) = *(current_length + 3) = 16; // promote + if (*current_length == 12) + { + for (block = 0; block < 20; block++) + current_length[block] = 12; + current_length += 20; + } + break; + case 16: + for (block = 0; block < 8; block += 4) + if (*(current_length + block) > 16) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + if (*current_length == 16) + { + for (block = 0; block < 8; block++) + current_length[block] = 16; + current_length += 8; + } + break; + case 21: + for (block = 0; block < 12; block += 4) // 12 in a double 128-bit word + if (*(current_length + block) > 21) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + if (*current_length == 21) + { + for (block = 0; block < 12; block++) + current_length[block] = 21; + current_length += 12; + } + break; + case 32: + for (block = 0; block < 4; block += 4) + if (*(current_length + block) > 32) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 64; // promote + if (*current_length == 32) + { + for (block = 0; block < 4; block++) + current_length[block] = 32; + current_length += 4; + } + break; + default: + exit(printf("Selecting on a non whole power of 2, must exit\n")); + break; + } + } + +/* + We can now compress based on the lengths in length_buffer +*/ +run_length = 1; +bits = length_buffer[0]; +keys = length_buffer; // we're going to re-use the length_buffer because it can't overlap and this saves a double malloc +for (current = (uint32_t *)source + 1; current < source + source_integers; current++) + { + new_needed = length_buffer[current - source]; + if (new_needed == bits) + run_length++; + else + { + write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + bits = new_needed; + run_length = 1; + } + } +write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + +/* + Copy the 
lengths to the end +*/ +memcpy(destination, length_buffer, keys - length_buffer); +destination += keys - length_buffer; + +/* + Add the pointer to the lengths +*/ +uint32_t val = keys - length_buffer + vbyte_bytes_needed_for(keys - length_buffer); // offset (from the end) to the start of the keys +if (vbyte_bytes_needed_for(val) > vbyte_bytes_needed_for(keys - length_buffer)) + val = keys - length_buffer + vbyte_bytes_needed_for(val); // although rare, this happens when adding the length of the vbyte encoded length makes the vbyte encoding one byte longer (i.e. 127) +vbyte_compress_into(destination, val); + +destination += vbyte_bytes_needed_for(val); + + +/* + Compute the length (in bytes) +*/ +*nvalue = destination - (uint8_t *)into; // return length in bytes +} + +#ifdef MAKE_DECOMPRESS + /* + The following program generates the source code for compress_runlength::decodeArray() + */ + /* + MAIN() + ------ + This version assumes SSE4.1 and so it is *not* portable to non X86 architectures + */ + int main(void) + { + uint32_t instance; + + + printf("static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};\n"); + printf("static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};\n"); + printf("static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};\n"); + printf("static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f};\n"); + printf("static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};\n"); + printf("static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};\n"); + printf("static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};\n"); + printf("static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};\n"); + printf("static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 
0x01};\n"); + printf("void compress_qmx::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)\n"); + printf("{\n"); + printf("__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;\n"); + printf("uint8_t *in = (uint8_t *)source;\n"); + printf("uint32_t *end = to + destination_integers;\n"); + printf("uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1);\n"); + printf("uint8_t *keys = (uint8_t *)source + len - key_start;\n"); + + printf("\n"); + printf("mask_21 = _mm_load_si128((__m128i *)static_mask_21);\n"); + printf("mask_12 = _mm_load_si128((__m128i *)static_mask_12);\n"); + printf("mask_10 = _mm_load_si128((__m128i *)static_mask_10);\n"); + printf("mask_9 = _mm_load_si128((__m128i *)static_mask_9);\n"); + printf("mask_7 = _mm_load_si128((__m128i *)static_mask_7);\n"); + printf("mask_6 = _mm_load_si128((__m128i *)static_mask_6);\n"); + printf("mask_5 = _mm_load_si128((__m128i *)static_mask_5);\n"); + printf("mask_4 = _mm_load_si128((__m128i *)static_mask_4);\n"); + printf("mask_3 = _mm_load_si128((__m128i *)static_mask_3);\n"); + printf("mask_2 = _mm_load_si128((__m128i *)static_mask_2);\n"); + printf("mask_1 = _mm_load_si128((__m128i *)static_mask_1);\n"); + printf("\n"); + + printf("while (to < end)\n"); + printf("\t{\n"); + printf("\tswitch (*keys++)\n"); + printf("\t\t{\n"); + + for (instance = 0; instance <= 0xFF; instance++) + { + printf("\t\tcase 0x%02x:\n", instance); + if ((instance >> 4) == 0) + { + /* + 256 0-bit integers + */ + printf("#ifdef NO_ZEROS\n"); + printf("\t\t\ttmp = _mm_load_si128((__m128i *)static_mask_1);\n"); + printf("#else\n"); + printf("\t\t\ttmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));\n"); + printf("#endif\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, tmp);\n"); + 
printf("\t\t\t_mm_store_si128((__m128i *)to + 2, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 6, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 7, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 8, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 9, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 10, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 11, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 12, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 13, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 14, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 15, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 16, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 17, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 18, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 19, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 20, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 21, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 22, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 23, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 24, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 25, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 26, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 27, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 28, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 29, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 30, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 31, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 32, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 33, 
tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 34, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 35, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 36, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 37, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 38, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 39, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 40, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 41, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 42, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 43, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 44, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 45, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 46, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 47, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 48, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 49, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 50, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 51, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 52, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 53, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 54, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 55, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 56, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 57, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 58, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 59, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 60, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 61, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 62, tmp);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 63, tmp);\n"); + printf("\t\t\tto += 256;\n"); // becomes 256 integers + } + else if (instance >> 4 == 1) + { + 
/* + 128 * 1-bit integers + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = 
_mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + 
printf("\t\t\t_mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 128;\n"); // becomes 128 integers + } + else if (instance >> 4 == 2) + { + /* + 64 * 2-bit integers + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));\n"); + + printf("\t\t\tin += 
16;\n"); // 16 bytes + printf("\t\t\tto += 64;\n"); // becomes 64 integers + } + else if (instance >> 4 == 3) + { + /* + 40 * 3-bit integers + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 40;\n"); // becomes 40 integers + } + else if (instance >> 4 == 4) + { + /* + 32 * 4-bit integers + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + 
printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 32;\n"); // becomes 32 integers + } + else if (instance >> 4 == 5) + { + /* + 24 * 5-bit integers + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, 
mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 24;\n"); // becomes 24 integers + } + else if (instance >> 4 == 6) + { + /* + 20 * 6-bit integers + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 20;\n"); // becomes 20 integers + } + else if (instance >> 4 == 7) + { + /* + 36 * 7 bit integers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = 
_mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));\n"); + + printf("\t\t\tbyte_stream_2 = _mm_load_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 3);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 36;\n"); // becomes 36 integers + } + else if (instance >> 4 == 8) + { + /* + 16 * 8-bit integers + */ + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));\n"); + printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));\n"); + printf("\t\t\ttmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));\n"); + printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 16;\n"); // becomes 16 integers + } + 
else if (instance >> 4 == 9) + { + /* + 28 * 9-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream_2 = _mm_load_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 4);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 28;\n"); // becomes 28 integers + } + else if (instance >> 4 == 10) + { + /* + 12 * 10-bit integers + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 
12;\n"); // becomes 12 integers + } + else if (instance >> 4 == 11) + { + /* + 20 * 12-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream_2 = _mm_load_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 8);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 20;\n"); // becomes 20 integers + } + else if (instance >> 4 == 12) + { + /* + 16-bit integers + */ + printf("\t\t\ttmp = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 8;\n"); // becomes 8 integers + } + else if (instance >> 4 == 13) + { + /* + 12 * 21-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_load_si128((__m128i *)in);"); + printf("\t\t\t_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));"); + printf("\t\t\tbyte_stream_2 = _mm_load_si128((__m128i *)in + 1);"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21));"); + printf("\t\t\t_mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 12;\n"); // becomes 8 integers + } + else if (instance >> 4 == 14) + { + /* + 32-bit integers + */ + printf("\t\t\ttmp = _mm_load_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_store_si128((__m128i *)to, tmp);\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 4;\n"); // becomes 4 integers + } + else + { + printf("\t\t\tin++;\n"); // dummy, can't occur + } + if ((instance & 0xF) == 0xF) + printf("\t\t\tbreak;\n"); // every 32 instances we break (its the end of the fall through) + } + printf("\t\t}\n"); + printf("\t}\n"); + printf("}\n"); + } +#endif + +#ifdef TEST_ONE_STRING + static uint32_t sequence[]={0x333,0xC7,0x21C,0x78F,0x66A,0x787,0xD0C,0xEE,0x416,0x2F8,0x410,0xFF3,0x7A7,0x35C,0x5A8,0x4ED,0x3AD,0x121,0x3A7,0x5EC,0x53,0x50C,0xFD6,0x697,0xF4,0x894,0xB5F,0x381,0x10C,0xB1E,0x2E4,0x32,0x7EB,0x1C6,0x1DB,0xE3,0x27,0x920,0x262,0x718,0x95,0x7C0,0x155,0x8F,0x83A,0x1178,0xCEF,0x7DC,0x3CB,0x30E,0x2EA,0x16F,0x212,0x4A,0x9F0,0x233,0x7,0x9F7,0x1EE,0x91,0x12FD,0x7C,0x291,0x203,0x2F8,0x39B,0x411,0x61C,0x3E2,0x1DF,0xCD7,0x5DA,0xD35,0x21,0x1C8D,0x25,0x313,0x314,0xBBB,0xFB,0x1E2,0x60,0x3F5,0x513,0x3AC,0x769,0x45E,0x485,0x1BA,0x17B,0x2DC,0x173,0x151,0x163E,0x101,0xE9D,0xB67,0x28B,0x4CA,0x955,0x6B3,0x112,0x225,0x742,0x432,0x453,0x3CF,0x541,0xCCE,0xDB6,0x406,0x58,0x202,0x647,0x9F,0x29,0x153,0x51E,0x233,0x7A3,0x731,0x3A,0xA0,0xD23,0x3C7,0xD1,0x5C,0xB90,0x22C,0xE8,0x78B,0x5E3}; + + static uint32_t second_compress_buffer[100000]; + static uint32_t second_decompress_buffer[100000]; + + uint32_t second_compress_buffer_size = sizeof(second_compress_buffer) / sizeof(*second_compress_buffer); + uint32_t second_decompress_buffer_size = sizeof(second_decompress_buffer) / sizeof(*second_decompress_buffer); + + /* + CHECK() + ------- + */ + void 
check(uint32_t *sequence, uint32_t sequence_length) + { + compress_qmx_ascd compressor; + uint64_t buffer_size; + uint32_t pos; + uint32_t fail; + + memset(second_compress_buffer, 0, second_compress_buffer_size); + memset(second_decompress_buffer, 0, second_decompress_buffer_size); + + compressor.encodeArray(sequence, sequence_length, (uint32_t *)second_compress_buffer, &buffer_size); + second_compress_buffer[buffer_size] = 0; + second_compress_buffer[buffer_size + 1] = 0; + second_compress_buffer[buffer_size + 2] = 0; + second_compress_buffer[buffer_size + 3] = 0; + + for (pos = 0; pos < buffer_size; pos++) + printf("%02X ", ((uint8_t *)second_compress_buffer)[pos]); + puts(""); + + compressor.decodeArray((uint32_t *)second_compress_buffer, buffer_size, (uint32_t *)second_decompress_buffer, sequence_length); + + fail = false; + for (pos = 0; pos < sequence_length; pos++) + if (sequence[pos] != second_decompress_buffer[pos]) + { + printf("p[%d]:%X != %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + fail = true; + } + else + printf("p[%d]:%X == %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + + if (fail) + puts("Test failed"); + else + puts("Test succeeded"); + } + + /* + MAIN() + ------ + */ + int main(void) + { + check(sequence, sizeof(sequence) / sizeof(*sequence)); + } +#endif +/* + COMPRESS_QMX_ADCS::DECODEARRAY() + --------------------------- + this code was generated by the method above. 
+*/ +static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; +static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; +static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; +static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; +static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f}; +static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; +static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; +static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; +static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; +static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; +static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; +void ANT_compress_qmx_adcs::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) +{ +__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; +uint8_t *in = (uint8_t *)source; +uint32_t *end = to + destination_integers; +uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1); +uint8_t *keys = (uint8_t *)source + len - key_start; + +mask_21 = _mm_load_si128((__m128i *)static_mask_21); +mask_12 = _mm_load_si128((__m128i *)static_mask_12); +mask_10 = _mm_load_si128((__m128i *)static_mask_10); +mask_9 = _mm_load_si128((__m128i *)static_mask_9); +mask_7 = _mm_load_si128((__m128i *)static_mask_7); +mask_6 = _mm_load_si128((__m128i *)static_mask_6); +mask_5 = _mm_load_si128((__m128i *)static_mask_5); +mask_4 = _mm_load_si128((__m128i *)static_mask_4); +mask_3 = _mm_load_si128((__m128i *)static_mask_3); +mask_2 = _mm_load_si128((__m128i *)static_mask_2); +mask_1 = _mm_load_si128((__m128i *)static_mask_1); + +while (to < end) + { + switch (*keys++) + { + case 0x00: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x01: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x02: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x03: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x04: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x05: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x06: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x07: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x08: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x09: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0a: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0b: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0c: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0d: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0e: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + _mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 
43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0f: +#ifdef NO_ZEROS + tmp = _mm_load_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_store_si128((__m128i *)to, tmp); + _mm_store_si128((__m128i *)to + 1, tmp); + _mm_store_si128((__m128i *)to + 2, tmp); + _mm_store_si128((__m128i *)to + 3, tmp); + _mm_store_si128((__m128i *)to + 4, tmp); + _mm_store_si128((__m128i *)to + 5, tmp); + _mm_store_si128((__m128i *)to + 6, tmp); + _mm_store_si128((__m128i *)to + 7, tmp); + _mm_store_si128((__m128i *)to + 8, tmp); + _mm_store_si128((__m128i *)to + 9, tmp); + _mm_store_si128((__m128i *)to + 10, tmp); + _mm_store_si128((__m128i *)to + 11, tmp); + _mm_store_si128((__m128i *)to + 12, tmp); + _mm_store_si128((__m128i *)to + 13, tmp); + _mm_store_si128((__m128i *)to + 14, tmp); + _mm_store_si128((__m128i *)to + 15, tmp); + _mm_store_si128((__m128i *)to + 16, tmp); + _mm_store_si128((__m128i *)to + 17, tmp); + _mm_store_si128((__m128i *)to + 18, tmp); + _mm_store_si128((__m128i *)to + 19, tmp); + _mm_store_si128((__m128i *)to + 20, tmp); + 
_mm_store_si128((__m128i *)to + 21, tmp); + _mm_store_si128((__m128i *)to + 22, tmp); + _mm_store_si128((__m128i *)to + 23, tmp); + _mm_store_si128((__m128i *)to + 24, tmp); + _mm_store_si128((__m128i *)to + 25, tmp); + _mm_store_si128((__m128i *)to + 26, tmp); + _mm_store_si128((__m128i *)to + 27, tmp); + _mm_store_si128((__m128i *)to + 28, tmp); + _mm_store_si128((__m128i *)to + 29, tmp); + _mm_store_si128((__m128i *)to + 30, tmp); + _mm_store_si128((__m128i *)to + 31, tmp); + _mm_store_si128((__m128i *)to + 32, tmp); + _mm_store_si128((__m128i *)to + 33, tmp); + _mm_store_si128((__m128i *)to + 34, tmp); + _mm_store_si128((__m128i *)to + 35, tmp); + _mm_store_si128((__m128i *)to + 36, tmp); + _mm_store_si128((__m128i *)to + 37, tmp); + _mm_store_si128((__m128i *)to + 38, tmp); + _mm_store_si128((__m128i *)to + 39, tmp); + _mm_store_si128((__m128i *)to + 40, tmp); + _mm_store_si128((__m128i *)to + 41, tmp); + _mm_store_si128((__m128i *)to + 42, tmp); + _mm_store_si128((__m128i *)to + 43, tmp); + _mm_store_si128((__m128i *)to + 44, tmp); + _mm_store_si128((__m128i *)to + 45, tmp); + _mm_store_si128((__m128i *)to + 46, tmp); + _mm_store_si128((__m128i *)to + 47, tmp); + _mm_store_si128((__m128i *)to + 48, tmp); + _mm_store_si128((__m128i *)to + 49, tmp); + _mm_store_si128((__m128i *)to + 50, tmp); + _mm_store_si128((__m128i *)to + 51, tmp); + _mm_store_si128((__m128i *)to + 52, tmp); + _mm_store_si128((__m128i *)to + 53, tmp); + _mm_store_si128((__m128i *)to + 54, tmp); + _mm_store_si128((__m128i *)to + 55, tmp); + _mm_store_si128((__m128i *)to + 56, tmp); + _mm_store_si128((__m128i *)to + 57, tmp); + _mm_store_si128((__m128i *)to + 58, tmp); + _mm_store_si128((__m128i *)to + 59, tmp); + _mm_store_si128((__m128i *)to + 60, tmp); + _mm_store_si128((__m128i *)to + 61, tmp); + _mm_store_si128((__m128i *)to + 62, tmp); + _mm_store_si128((__m128i *)to + 63, tmp); + to += 256; + break; + case 0x10: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i 
*)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, 
mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x11: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, 
mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x12: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x13: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, 
_mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x14: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x15: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x16: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x17: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, 
_mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x18: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x19: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, 
_mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 20, 
_mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_store_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + break; + case 0x20: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to 
+ 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x21: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x22: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x23: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x24: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x25: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x26: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x27: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x28: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x29: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_store_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + break; + case 0x30: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x31: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x32: + byte_stream = _mm_load_si128((__m128i *)in); + 
_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x33: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + 
_mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x34: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x35: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = 
_mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x36: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = 
_mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x37: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x38: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x39: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + 
to += 40; + case 0x3a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, 
mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_store_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + break; + case 0x40: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + 
_mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x41: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x42: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = 
_mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x43: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x44: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x45: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x46: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x47: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x48: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + 
_mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x49: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = 
_mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + break; + case 0x50: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + 
_mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x51: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x52: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = 
_mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x53: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x54: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x55: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x56: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x57: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x58: + byte_stream = 
_mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x59: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + 
byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i 
*)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + 
_mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + break; + case 0x60: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x61: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x62: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, 
mask_6)); + in += 16; + to += 20; + case 0x63: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x64: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x65: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x66: + byte_stream = 
_mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x67: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x68: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x69: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = 
_mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + break; + case 0x70: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x71: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x72: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i 
*)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x73: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x74: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x75: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x76: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 
7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x77: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x78: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x79: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), 
_mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i 
*)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_store_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + break; + case 0x80: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = 
_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x81: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x82: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x83: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x84: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x85: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x86: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = 
_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x87: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x88: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x89: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8a: + tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8b: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8c: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8d: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + 
_mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8e: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8f: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_store_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_store_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + break; + case 0x90: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = 
_mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x91: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x92: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x93: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x94: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), 
_mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x95: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x96: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = 
_mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x97: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x98: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x99: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9a: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = 
_mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9b: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9c: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9d: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9e: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = 
_mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9f: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_store_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + break; + case 0xa0: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa1: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa2: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa3: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa4: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa5: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa6: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa7: + byte_stream = _mm_load_si128((__m128i *)in); + 
_mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa8: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa9: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xaa: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xab: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xac: + byte_stream = 
_mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xad: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xae: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xaf: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + break; + case 0xb0: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb1: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb2: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb3: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb4: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb5: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb6: + byte_stream = 
_mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb7: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb8: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + 
byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb9: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xba: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbb: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbc: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbd: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbe: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbf: + byte_stream = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_load_si128((__m128i *)in + 1); + _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_store_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_store_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + break; + case 0xc0: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc1: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc2: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc3: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc4: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc5: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc6: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc7: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc8: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc9: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + 
in += 16; + to += 8; + case 0xca: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcb: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcc: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcd: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xce: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcf: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_store_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + break; + case 0xd0: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); 
_mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd1: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd2: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd3: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd4: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd5: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, 
mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd6: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd7: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd8: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xd9: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xda: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xdb: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xdc: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xdd: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xde: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = 
_mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + case 0xdf: + byte_stream = _mm_load_si128((__m128i *)in); _mm_store_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_load_si128((__m128i *)in + 1); _mm_store_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_store_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; + to += 12; + break; + case 0xe0: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe1: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe2: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe3: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe4: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe5: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe6: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe7: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe8: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe9: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xea: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + 
case 0xeb: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xec: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xed: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xee: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xef: + tmp = _mm_load_si128((__m128i *)in); + _mm_store_si128((__m128i *)to, tmp); + in += 16; + to += 4; + break; + case 0xf0: + in++; + case 0xf1: + in++; + case 0xf2: + in++; + case 0xf3: + in++; + case 0xf4: + in++; + case 0xf5: + in++; + case 0xf6: + in++; + case 0xf7: + in++; + case 0xf8: + in++; + case 0xf9: + in++; + case 0xfa: + in++; + case 0xfb: + in++; + case 0xfc: + in++; + case 0xfd: + in++; + case 0xfe: + in++; + case 0xff: + in++; + break; + } + } +} diff --git a/ext/bench_/bench/compress_qmx_adcs.h b/ext/bench_/bench/compress_qmx_adcs.h new file mode 100644 index 0000000..8369c75 --- /dev/null +++ b/ext/bench_/bench/compress_qmx_adcs.h @@ -0,0 +1,42 @@ +/* + COMPRESS_QMX_ADCS_ADCS.H + -------------- +*/ +#ifndef COMPRESS_QMX_ADCS_H_ +#define COMPRESS_QMX_ADCS_H_ + +#include +#include "compress.h" + +/* + class ANT_COMPRESS_QMX_ADCS + ------------------ +*/ +class ANT_compress_qmx_adcs : public ANT_compress +{ +private: + uint8_t *length_buffer; + uint64_t length_buffer_length; + +public: + ANT_compress_qmx_adcs(); + virtual ~ANT_compress_qmx_adcs(); + + void encodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t *nvalue); + void decodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t nvalue); + + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) + { + uint64_t answer; + encodeArray(source, source_integers, (uint32_t *)destination, &answer); + return answer; + } + + virtual void decompress(uint32_t 
*destination, uint64_t destinaton_integers, uint8_t *source, uint64_t source_length) + { + decodeArray((uint32_t *)source, source_length, destination, destinaton_integers); + } +} ; + +#endif + diff --git a/ext/qmx/compress_qmx.cc b/ext/bench_/bench/compress_qmx_decompress.cpp similarity index 80% rename from ext/qmx/compress_qmx.cc rename to ext/bench_/bench/compress_qmx_decompress.cpp index 30c6c02..ecb2757 100644 --- a/ext/qmx/compress_qmx.cc +++ b/ext/bench_/bench/compress_qmx_decompress.cpp @@ -1,6730 +1,5450 @@ -/* - COMPRESS_QMX.C - -------------- - Copyright (c) 2014 by Andrew Trotman - Licensed BSD - - A version of BinPacking where we pack into a 128-bit SSE register the following: - 256 0-bit words - 128 1-bit words - 64 2-bit words - 40 3-bit words - 32 4-bit words - 24 5-bit words - 20 6-bit words - 16 8-bit words - 12 10-bit words - 8 16-bit words - 4 32-bit words - or pack into two 128-bit words (i.e. 256 bits) the following: - 36 7-bit words - 28 9-bit words - 20 12-bit words - 12 21-bit words - - This gives us 15 possible combinations. The combinaton is stored in the top 4 bits of a selector byte. The - bottom 4-bits of the selector store a run-length (the number of such sequences seen in a row. - - The 128-bit (or 256-bit) packed binary values are stored first. Then we store the selectors, Finally, - stored variable byte encoded, is a pointer to the start of the selector (from the end of the sequence). - - This way, all reads and writes are 128-bit word aligned, except addressing the selector (and the pointer - the selector). These reads are byte aligned. - - Note: There is currently 1 unused encoding (i.e. 16 unused selecvtor values). These might in the future be - used for encoding exceptions, much as PForDelta does. 
-*/ -#include -#include -#include -#include -#include -#include "compress_qmx.h" -/* - class COMPRESS_QMX - ------------------ -*/ -class compress_qmx -{ -private: - uint8_t *length_buffer; - uint64_t length_buffer_length; - -public: - compress_qmx(); - virtual ~compress_qmx(); - - virtual void encodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t *nvalue); - virtual void decodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t nvalue); -} ; - -//#define MAKE_DECOMPRESS 1 /* uncomment this and it will create a program that writes the decompressor */ -//#define TEST_ONE_STRING 1 /* Uncomment this and it will create a program that can be used to test the compressor and decompressor */ -#define NO_ZEROS 1 /* stores runs of 256 1s in a row (not 1-bit number, but actual 1 values). */ -#define SHORT_END_BLOCKS 1 - -#ifdef _MSC_VER - #define ALIGN_16 __declspec(align(16)) -#else - #define ALIGN_16 __attribute__ ((aligned (16))) -#endif - -//#define STATS /* uncomment this and it will count the selector usage */ -#ifdef STATS - static uint32_t stats[65] = {0}; -#endif - -/* - COMPRESS_QMX::COMPRESS_QMX() - ---------------------------- -*/ -compress_qmx::compress_qmx() -{ -length_buffer = NULL; -length_buffer_length = 0; -} - -/* - COMPRESS_QMX::!COMPRESS_QMX() - ----------------------------- -*/ -compress_qmx::~compress_qmx() -{ -delete [] length_buffer; -#ifdef STATS - uint32_t which; - for (which = 0; which <= 32; which++) - if (stats[which] != 0) - printf("%d\t%d\ttimes\n", which, stats[which]); -#endif -} - -/* - BITS_NEEDED_FOR() - ----------------- -*/ -static uint8_t bits_needed_for(uint32_t value) -{ -if (value == 0x01) - return 0; -else if (value <= 0x01) - return 1; -else if (value <= 0x03) - return 2; -else if (value <= 0x07) - return 3; -else if (value <= 0x0F) - return 4; -else if (value <= 0x1F) - return 5; -else if (value <= 0x3F) - return 6; -else if (value <= 0x7F) - return 7; -else if (value <= 0xFF) - return 8; -else if 
(value <= 0x1FF) - return 9; -else if (value <= 0x3FF) - return 10; -else if (value <= 0xFFF) - return 12; -else if (value <= 0xFFFF) - return 16; -else if (value <= 0x1FFFFF) - return 21; -else - return 32; -} - -/* - VBYTE_BYTES_NEEDED_FOR() - ------------------------ -*/ -static inline uint32_t vbyte_bytes_needed_for(uint32_t docno) -{ -if (docno < (1 << 7)) - return 1; -else if (docno < (1 << 14)) - return 2; -else if (docno < (1 << 21)) - return 3; -else if (docno < (1 << 28)) - return 4; -else - return 5; -} - -/* - VBYTE_COMPRESS_INTO() - --------------------- - NOTE: We compress "backwards" because we want to keep decompressing from the end of the string - to get the number -*/ -static inline void vbyte_compress_into(uint8_t *dest, uint32_t docno) -{ -if (docno < (1 << 7)) - dest[0] = (docno & 0x7F) | 0x80; -else if (docno < (1 << 14)) - { - dest[1] = (docno >> 7) & 0x7F; - dest[0] = (docno & 0x7F) | 0x80; - } -else if (docno < (1 << 21)) - { - dest[2] = (docno >> 14) & 0x7F; - dest[1] = (docno >> 7) & 0x7F; - dest[0] = (docno & 0x7F) | 0x80; - } -else if (docno < (1 << 28)) - { - dest[3] = (docno >> 21) & 0x7F; - dest[2] = (docno >> 14) & 0x7F; - dest[1] = (docno >> 7) & 0x7F; - dest[0] = (docno & 0x7F) | 0x80; - } -else - { - dest[4] = (docno >> 28) & 0x7F; - dest[3] = (docno >> 21) & 0x7F; - dest[2] = (docno >> 14) & 0x7F; - dest[1] = (docno >> 7) & 0x7F; - dest[0] = (docno & 0x7F) | 0x80; - } -} - -/* - VBYTE_DECOMPRESS() - ------------------ - NOTE: this method is given a ponter to the end of the v-byte compressed - integer. 
The task is to work backwards until it gets the integer -*/ -static inline uint32_t vbyte_decompress(uint8_t *source) -{ -uint32_t result; - -if (*source & 0x80) - return *source & 0x7F; -else - { - result = *source--; - - while (!(*source & 0x80)) - result = (result << 7) | *source--; - - return (result << 7) | (*source & 0x7F); - } -} - -/* - WRITE_OUT() - ----------- -*/ -static void write_out(uint8_t **buffer, uint32_t *source, uint32_t raw_count, uint32_t size_in_bits, uint8_t **length_buffer) -{ -uint32_t current, batch; -uint8_t *destination = *buffer; -uint32_t *end = source + raw_count; -uint8_t *key_store = *length_buffer; -uint32_t ALIGN_16 sequence_buffer[4]; -uint32_t instance, value; -uint8_t type; -uint32_t count; - -#ifdef STATS - stats[size_in_bits] += raw_count; -#endif - -if (size_in_bits == 0) - { - type = 0; - count = (raw_count + 255) / 256; - } -else if (size_in_bits == 1) - { - type = 1; // 1 bit per integer - count = (raw_count + 127) / 128; - } -else if (size_in_bits == 2) - { - type = 2; // 2 bits per integer - count = (raw_count + 63) / 64; - } -else if (size_in_bits == 3) - { - type = 3; // 3 bits per integer - count = (raw_count + 39) / 40; - } -else if (size_in_bits == 4) - { - type = 4; // 4 bits per integer - count = (raw_count + 31) / 32; - } -else if (size_in_bits == 5) - { - type = 5; // 5 bits per integer - count = (raw_count + 23) / 24; - } -else if (size_in_bits == 6) - { - type = 6; // 6 bits per integer - count = (raw_count + 19) / 20; - } -else if (size_in_bits == 7) - { - type = 7; // 7 bits per integer, 18 integers per read (but requires 2 reads) - count = (raw_count + 35) / 36; - } -else if (size_in_bits == 8) - { - type = 8; // 8 bits per integer - count = (raw_count + 15) / 16; - } -else if (size_in_bits == 9) - { - type = 9; // 9 bits per integer, 14 integers per read (but requires 2 reads) - count = (raw_count + 27) / 28; - } -else if (size_in_bits == 10) - { - type = 10; // 10 bits per integer - count = (raw_count + 
11) / 12; - } -else if (size_in_bits == 12) - { - type = 11; // 12 bits per integer, 10 integers per read (but requires 2 reads) - count = (raw_count + 19) / 20; - } -else if (size_in_bits == 16) - { - type = 12; // 16 bits per integer - count = (raw_count + 7) / 8; - } -else if (size_in_bits == 21) - { - type = 13; // 21 bits per integer, 6 integers per read (but requires 2 reads) - count = (raw_count + 11) / 12; - } -else if (size_in_bits == 32) - { - type = 14; // 32 bits per integer - count = (raw_count + 3) / 4; - } -else - exit(printf("Can't compress into integers of size %dbits\n", size_in_bits)); - -while (count > 0) - { - batch = count > 16 ? 16 : count; - *key_store++ = (type << 4) | (~(batch - 1) & 0x0F); - - count -= batch; - - for (current = 0; current < batch; current++) - { - switch (size_in_bits) - { - case 0: // 0 bits per integer (i.e. a long sequence of zeros) - /* - In this case we don't need to store a 4 byte integer because its implicit - */ - source += 256; - break; - case 1: // 1 bit per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 128; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 1); - - memcpy(destination, sequence_buffer, 16); - destination += 16; - source += 128; - break; - case 2: // 2 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 64; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 2); - - memcpy(destination, sequence_buffer, 16); - destination += 16; - source += 64; - break; - case 3: // 3 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 40; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 3); - - memcpy(destination, sequence_buffer, 16); - destination += 16; - source += 40; - break; - case 4: // 4 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 32; value++) - 
sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 4); - - memcpy(destination, sequence_buffer, 16); - destination += 16; - source += 32; - break; - case 5: // 5 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 24; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 5); - - memcpy(destination, sequence_buffer, 16); - destination += 16; - source += 24; - break; - case 6: // 6 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 20; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 6); - memcpy(destination, sequence_buffer, 16); - destination += 16; - source += 20; - break; - case 7: // 7 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 20; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 7); - memcpy(destination, sequence_buffer, 16); - destination += 16; - - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 16; value < 20; value++) - sequence_buffer[value & 0x03] |= source[value] >> 4; - for (value = 20; value < 36; value++) - sequence_buffer[value & 0x03] |= source[value] << (((value - 20) / 4) * 7 + 3); - memcpy(destination, sequence_buffer, 16); - - destination += 16; - source += 36; // 36 in a double 128-bit word - break; - case 8: // 8 bits per integer -#ifdef SHORT_END_BLOCKS - for (instance = 0; instance < 16 && source < end; instance++) -#else - for (instance = 0; instance < 16; instance++) -#endif - *destination++ = (uint8_t)*source++; - break; - case 9: // 9 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 16; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 9); - memcpy(destination, sequence_buffer, 16); - destination += 16; - - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 12; value < 16; value++) - 
sequence_buffer[value & 0x03] |= source[value] >> 5; - for (value = 16; value < 28; value++) - sequence_buffer[value & 0x03] |= source[value] << (((value - 16) / 4) * 9 + 4); - memcpy(destination, sequence_buffer, 16); - - destination += 16; - source += 28; // 28 in a double 128-bit word - break; - case 10: // 10 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 12; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 10); - - memcpy(destination, sequence_buffer, 16); - destination += 16; - source += 12; - break; - case 12: // 12 bit integers - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 12; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 12); - memcpy(destination, sequence_buffer, 16); - destination += 16; - - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 8; value < 12; value++) - sequence_buffer[value & 0x03] |= source[value] >> 8; - for (value = 12; value < 20; value++) - sequence_buffer[value & 0x03] |= source[value] << (((value - 12) / 4) * 12 + 8); - memcpy(destination, sequence_buffer, 16); - - destination += 16; - source += 20; // 20 in a double 128-bit word - break; - case 16: // 16 bits per integer -#ifdef SHORT_END_BLOCKS - for (instance = 0; instance < 8 && source < end; instance++) -#else - for (instance = 0; instance < 8; instance++) -#endif - { - *(uint16_t *)destination = (uint16_t)*source++; - destination += 2; - } - break; - case 21: // 21 bits per integer - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 0; value < 8; value++) - sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 21); - memcpy(destination, sequence_buffer, 16); - destination += 16; - - memset(sequence_buffer, 0, sizeof(sequence_buffer)); - for (value = 4; value < 8; value++) - sequence_buffer[value & 0x03] |= source[value] >> 11; - for (value = 8; value < 12; value++) - sequence_buffer[value & 
0x03] |= source[value] << (((value - 8) / 4) * 21 + 11); - memcpy(destination, sequence_buffer, 16); - - destination += 16; - source += 12; // 12 in a double 128-bit word - break; - case 32: // 32 bits per integer -#ifdef SHORT_END_BLOCKS - for (instance = 0; instance < 4 && source < end; instance++) -#else - for (instance = 0; instance < 4; instance++) -#endif - { - *(uint32_t *)destination = (uint32_t)*source++; - destination += 4; - } - break; - } - } - } -*buffer = destination; -*length_buffer = key_store; -} - -/* - MAX() - ----- -*/ -template -T max(T a, T b) -{ -return a > b ? a : b; -} - -/* - MAX() - ----- -*/ -template -T max(T a, T b, T c, T d) -{ -return max(max(a, b), max(c, d)); -} - -/* - COMPRESS_QMX::ENCODEARRAY() - --------------------------- -*/ -void compress_qmx::encodeArray(const uint32_t *source, uint64_t source_integers, uint32_t *into, uint64_t *nvalue) -{ -const uint32_t WASTAGE = 512; -uint8_t *current_length, *destination = (uint8_t *)into, *keys; -uint32_t *current, run_length, bits, new_needed, wastage; -uint32_t block, largest; - -/* - make sure we have enough room to store the lengths -*/ -if (length_buffer_length < source_integers) - { - delete [] length_buffer; - length_buffer = new uint8_t [(size_t)((length_buffer_length = source_integers) + WASTAGE)]; - } - -/* - Get the lengths of the integers -*/ -current_length = length_buffer; -for (current = (uint32_t *)source; current < source + source_integers; current++) - *current_length++ = bits_needed_for(*current); - -/* - Shove a bunch of 0 length integers on the end to allow for overflow -*/ -for (wastage = 0; wastage < WASTAGE; wastage++) - *current_length++ = 0; - -/* - Process the lengths. To maximise SSE throughput we need each write to be 128-bit (4*32-bit) alignned - and therefore we need each compress "block" to be the same size where a compress "block" is a set of - four encoded integers starting on a 4-integer boundary. 
-*/ -for (current_length = length_buffer; current_length < length_buffer + source_integers + 4; current_length += 4) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = max(*current_length, *(current_length + 1), *(current_length + 2), *(current_length + 3)); - -/* - This code makes sure we can do aligned reads, promoting to larger integers if necessary -*/ -current_length = length_buffer; -while (current_length < length_buffer + source_integers) - { -#ifdef SHORT_END_BLOCKS - /* - If there are fewer than 16 values remaining and they all fit into 8-bits then its smaller than storing stripes - If there are fewer than 8 values remaining and they all fit into 16-bits then its smaller than storing stripes - If there are fewer than 4 values remaining and they all fit into 32-bits then its smaller than storing stripes - */ - if (source_integers - (current_length - length_buffer) < 4) - { - largest = 0; - for (block = 0; block < 8; block++) - largest = max((uint8_t)largest, *(current_length + block)); - if (largest <= 8) - for (block = 0; block < 8; block++) - *(current_length + block) = 8; - else if (largest <= 16) - for (block = 0; block < 8; block++) - *(current_length + block) = 16; - else if (largest <= 32) - for (block = 0; block < 8; block++) - *(current_length + block) = 32; - } - else if (source_integers - (current_length - length_buffer) < 8) - { - largest = 0; - for (block = 0; block < 8; block++) - largest = max((uint8_t)largest, *(current_length + block)); - if (largest <= 8) - for (block = 0; block < 8; block++) - *(current_length + block) = 8; - else if (largest <= 8) - for (block = 0; block < 8; block++) - *(current_length + block) = 16; - } - else if (source_integers - (current_length - length_buffer) < 16) - { - largest = 0; - for (block = 0; block < 16; block++) - largest = max((uint8_t)largest, *(current_length + block)); - if (largest <= 8) - for (block = 0; block < 16; block++) - *(current_length + block) = 8; 
- } - /* - Otherwise we have the standard rules for a block - */ -#endif - switch (*current_length) - { - case 0: - for (block = 0; block < 256; block += 4) - if (*(current_length + block) > 0) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote - if (*current_length == 0) - { - for (block = 0; block < 256; block++) - current_length[block] = 0; - current_length += 256; - } - break; - case 1: - for (block = 0; block < 128; block += 4) - if (*(current_length + block) > 1) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote - if (*current_length == 1) - { - for (block = 0; block < 128; block++) - current_length[block] = 1; - current_length += 128; - } - break; - case 2: - for (block = 0; block < 64; block += 4) - if (*(current_length + block) > 2) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote - if (*current_length == 2) - { - for (block = 0; block < 64; block++) - current_length[block] = 2; - current_length += 64; - } - break; - case 3: - for (block = 0; block < 40; block += 4) - if (*(current_length + block) > 3) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote - if (*current_length == 3) - { - for (block = 0; block < 40; block++) - current_length[block] = 3; - current_length += 40; - } - break; - case 4: - for (block = 0; block < 32; block += 4) - if (*(current_length + block) > 4) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote - if (*current_length == 4) - { - for (block = 0; block < 32; block++) - current_length[block] = 4; - current_length += 32; - } - break; - case 5: - for (block = 0; block < 24; block += 4) - if (*(current_length + block) > 5) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote - if (*current_length == 5) - { - 
for (block = 0; block < 24; block++) - current_length[block] = 5; - current_length += 24; - } - break; - case 6: - for (block = 0; block < 20; block += 4) - if (*(current_length + block) > 6) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote - if (*current_length == 6) - { - for (block = 0; block < 20; block++) - current_length[block] = 6; - current_length += 20; - } - break; - case 7: - for (block = 0; block < 36; block += 4) // 36 in a double 128-bit word - if (*(current_length + block) > 7) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote - if (*current_length == 7) - { - for (block = 0; block < 36; block++) - current_length[block] = 7; - current_length += 36; - } - break; - case 8: - for (block = 0; block < 16; block += 4) - if (*(current_length + block) > 8) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote - if (*current_length == 8) - { - for (block = 0; block < 16; block++) - current_length[block] = 8; - current_length += 16; - } - break; - case 9: - for (block = 0; block < 28; block += 4) // 28 in a double 128-bit word - if (*(current_length + block) > 9) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote - if (*current_length == 9) - { - for (block = 0; block < 28; block++) - current_length[block] = 9; - current_length += 28; - } - break; - case 10: - for (block = 0; block < 12; block += 4) - if (*(current_length + block) > 10) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote - if (*current_length == 10) - { - for (block = 0; block < 12; block++) - current_length[block] = 10; - current_length += 12; - } - break; - case 12: - for (block = 0; block < 20; block += 4) // 20 in a double 128-bit word - if (*(current_length + block) > 12) - *current_length = *(current_length + 1) = 
*(current_length + 2) = *(current_length + 3) = 16; // promote - if (*current_length == 12) - { - for (block = 0; block < 20; block++) - current_length[block] = 12; - current_length += 20; - } - break; - case 16: - for (block = 0; block < 8; block += 4) - if (*(current_length + block) > 16) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote - if (*current_length == 16) - { - for (block = 0; block < 8; block++) - current_length[block] = 16; - current_length += 8; - } - break; - case 21: - for (block = 0; block < 12; block += 4) // 12 in a double 128-bit word - if (*(current_length + block) > 21) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote - if (*current_length == 21) - { - for (block = 0; block < 12; block++) - current_length[block] = 21; - current_length += 12; - } - break; - case 32: - for (block = 0; block < 4; block += 4) - if (*(current_length + block) > 32) - *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 64; // promote - if (*current_length == 32) - { - for (block = 0; block < 4; block++) - current_length[block] = 32; - current_length += 4; - } - break; - default: - exit(printf("Selecting on a non whole power of 2, must exit\n")); - break; - } - } - -/* - We can now compress based on the lengths in length_buffer -*/ -run_length = 1; -bits = length_buffer[0]; -keys = length_buffer; // we're going to re-use the length_buffer because it can't overlap and this saves a double malloc -for (current = (uint32_t *)source + 1; current < source + source_integers; current++) - { - new_needed = length_buffer[current - source]; - if (new_needed == bits) - run_length++; - else - { - write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); - bits = new_needed; - run_length = 1; - } - } -write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); - -/* - Copy the 
lengths to the end -*/ -memcpy(destination, length_buffer, keys - length_buffer); -destination += keys - length_buffer; - -/* - Add the pointer to the lengths -*/ -uint32_t val = keys - length_buffer + vbyte_bytes_needed_for(keys - length_buffer); // offset (from the end) to the start of the keys -if (vbyte_bytes_needed_for(val) > vbyte_bytes_needed_for(keys - length_buffer)) - val = keys - length_buffer + vbyte_bytes_needed_for(val); // although rare, this happens when adding the length of the vbyte encoded length makes the vbyte encoding one byte longer (i.e. 127) -vbyte_compress_into(destination, val); - -destination += vbyte_bytes_needed_for(val); - - -/* - Compute the length (in bytes) -*/ -*nvalue = destination - (uint8_t *)into; // return length in bytes -} - -#ifdef MAKE_DECOMPRESS - /* - The following program generates the source code for compress_runlength::decodeArray() - */ - /* - MAIN() - ------ - This version assumes SSE4.1 and so it is *not* portable to non X86 architectures - */ - int main(void) - { - uint32_t instance; - - - printf("static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};\n"); - printf("static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};\n"); - printf("static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};\n"); - printf("static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};\n"); - printf("static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};\n"); - printf("static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f};\n"); - printf("static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};\n"); - printf("static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};\n"); - printf("static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};\n"); - printf("static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};\n"); - printf("static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 
0x01};\n"); - printf("void compress_qmx::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)\n"); - printf("{\n"); - printf("__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;\n"); - printf("uint8_t *in = (uint8_t *)source;\n"); - printf("uint32_t *end = to + destination_integers;\n"); - printf("uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1);\n"); - printf("uint8_t *keys = (uint8_t *)source + len - key_start;\n"); - - printf("\n"); - printf("mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);\n"); - printf("mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);\n"); - printf("mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);\n"); - printf("mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);\n"); - printf("mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);\n"); - printf("mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);\n"); - printf("mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);\n"); - printf("mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);\n"); - printf("mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);\n"); - printf("mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);\n"); - printf("mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);\n"); - printf("\n"); - - printf("while (to < end)\n"); - printf("\t{\n"); - printf("\tswitch (*keys++)\n"); - printf("\t\t{\n"); - - for (instance = 0; instance <= 0xFF; instance++) - { - printf("\t\tcase 0x%02x:\n", instance); - if ((instance >> 4) == 0) - { - /* - 256 0-bit integers - */ - printf("#ifdef NO_ZEROS\n"); - printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)static_mask_1);\n"); - printf("#else\n"); - printf("\t\t\ttmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));\n"); - printf("#endif\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, tmp);\n"); - 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 32, tmp);\n"); - 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 33, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 34, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 35, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 36, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 37, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 38, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 39, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 40, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 41, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 42, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 43, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 44, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 45, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 46, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 47, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 48, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 49, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 50, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 51, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 52, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 53, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 54, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 55, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 56, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 57, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 58, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 59, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 60, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 61, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 62, tmp);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 63, tmp);\n"); - 
printf("\t\t\tto += 256;\n"); // becomes 256 integers - } - else if (instance >> 4 == 1) - { - /* - 128 * 1-bit integers - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, 
_mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 128;\n"); // becomes 128 integers - } - else if (instance >> 4 == 2) - { - /* - 64 * 2-bit integers - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));\n"); - 
printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 
2);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 64;\n"); // becomes 64 integers - } - else if (instance >> 4 == 3) - { - /* - 40 * 3-bit integers - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 40;\n"); // becomes 40 integers - } - else if 
(instance >> 4 == 4) - { - /* - 32 * 4-bit integers - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 32;\n"); // becomes 32 integers - } - else if (instance >> 4 == 5) - { - /* - 24 * 5-bit integers - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));\n"); - 
printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 24;\n"); // becomes 24 integers - } - else if (instance >> 4 == 6) - { - /* - 20 * 6-bit integers - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 20;\n"); // becomes 20 integers - } - else if (instance >> 4 == 7) - { - /* - 36 * 7 bit integers (in two 128-bit words) - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));\n"); - printf("\t\t\tbyte_stream = 
_mm_srli_epi32(byte_stream, 7);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));\n"); - - printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 3);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));\n"); - - printf("\t\t\tin += 32;\n"); // 32 bytes - printf("\t\t\tto += 36;\n"); // becomes 36 integers - } - else if (instance >> 4 == 8) - { - /* - 16 * 8-bit integers - */ - printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));\n"); - printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));\n"); - printf("\t\t\ttmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));\n"); - printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); - 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 16;\n"); // becomes 16 integers - } - else if (instance >> 4 == 9) - { - /* - 28 * 9-bit ingtegers (in two 128-bit words) - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));\n"); - printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 4);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));\n"); - printf("\t\t\tin += 32;\n"); // 32 bytes - printf("\t\t\tto += 28;\n"); // becomes 28 integers - } - else if (instance >> 4 == 10) - { - /* - 12 * 10-bit integers - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));\n"); - printf("\t\t\tbyte_stream = 
_mm_srli_epi64(byte_stream, 10);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 12;\n"); // becomes 12 integers - } - else if (instance >> 4 == 11) - { - /* - 20 * 12-bit ingtegers (in two 128-bit words) - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));\n"); - printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 8);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));\n"); - printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));\n"); - - printf("\t\t\tin += 32;\n"); // 32 bytes - printf("\t\t\tto += 20;\n"); // becomes 20 integers - } - else if (instance >> 4 == 12) - { - /* - 16-bit integers - */ - printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 8;\n"); // becomes 8 integers - } - else if (instance >> 4 == 13) - { - /* - 12 * 21-bit ingtegers (in two 128-bit words) - */ - printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_21));"); - printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));"); - - printf("\t\t\tin += 32;\n"); // 32 bytes - printf("\t\t\tto += 12;\n"); // becomes 8 integers - } - else if (instance >> 4 == 14) - { - /* - 32-bit integers - */ - printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); - printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); - - printf("\t\t\tin += 16;\n"); // 16 bytes - printf("\t\t\tto += 4;\n"); // becomes 4 integers - } - else - { - printf("\t\t\tin++;\n"); // dummy, can't occur - } - if ((instance & 0xF) == 0xF) - printf("\t\t\tbreak;\n"); // every 32 instances we break (its the end of the fall through) - } - printf("\t\t}\n"); - printf("\t}\n"); - printf("}\n"); - } -#endif - -#ifdef TEST_ONE_STRING - static uint32_t sequence[]={0x333,0xC7,0x21C,0x78F,0x66A,0x787,0xD0C,0xEE,0x416,0x2F8,0x410,0xFF3,0x7A7,0x35C,0x5A8,0x4ED,0x3AD,0x121,0x3A7,0x5EC,0x53,0x50C,0xFD6,0x697,0xF4,0x894,0xB5F,0x381,0x10C,0xB1E,0x2E4,0x32,0x7EB,0x1C6,0x1DB,0xE3,0x27,0x920,0x262,0x718,0x95,0x7C0,0x155,0x8F,0x83A,0x1178,0xCEF,0x7DC,0x3CB,0x30E,0x2EA,0x16F,0x212,0x4A,0x9F0,0x233,0x7,0x9F7,0x1EE,0x91,0x12FD,0x7C,0x291,0x203,0x2F8,0x39B,0x411,0x61C,0x3E2,0x1DF,0xCD7,0x5DA,0xD35,0x21,0x1C8D,0x25,0x313,0x314,0xBBB,0xFB,0x1E2,0x60,0x3F5,0x513,0x3AC,0x769,0x45E,0x485,0x1BA,0x17B,0x2DC,0x173,0x151,0x163E,0x101,0xE9D,0xB67,0x28B,0x4CA,0x955,0x6B3,0x112,0x225,0x742,0x432,0x453,0x3CF,0x541,0xCCE,0xDB6,0x406,0x58,0x202,0x647,0x9F,0x29,0x153,0x51E,0x233,0x7A3,0x731,0x3A,0xA0,0xD23,0x3C7,0xD1,0x5C,0xB90,0x22C,0xE8,0x78B,0x5E3}; - - static uint32_t second_compress_buffer[100000]; - static uint32_t second_decompress_buffer[100000]; - - uint32_t 
second_compress_buffer_size = sizeof(second_compress_buffer) / sizeof(*second_compress_buffer); - uint32_t second_decompress_buffer_size = sizeof(second_decompress_buffer) / sizeof(*second_decompress_buffer); - - /* - CHECK() - ------- - */ - void check(uint32_t *sequence, uint32_t sequence_length) - { - compress_qmx compressor; - uint64_t buffer_size; - uint32_t pos; - uint32_t fail; - - memset(second_compress_buffer, 0, second_compress_buffer_size); - memset(second_decompress_buffer, 0, second_decompress_buffer_size); - - compressor.encodeArray(sequence, sequence_length, (uint32_t *)second_compress_buffer, &buffer_size); - second_compress_buffer[buffer_size] = 0; - second_compress_buffer[buffer_size + 1] = 0; - second_compress_buffer[buffer_size + 2] = 0; - second_compress_buffer[buffer_size + 3] = 0; - - for (pos = 0; pos < buffer_size; pos++) - printf("%02X ", ((uint8_t *)second_compress_buffer)[pos]); - puts(""); - - compressor.decodeArray((uint32_t *)second_compress_buffer, buffer_size, (uint32_t *)second_decompress_buffer, sequence_length); - - fail = false; - for (pos = 0; pos < sequence_length; pos++) - if (sequence[pos] != second_decompress_buffer[pos]) - { - printf("p[%d]:%X != %X\n", pos, sequence[pos], second_decompress_buffer[pos]); - fail = true; - } - else - printf("p[%d]:%X == %X\n", pos, sequence[pos], second_decompress_buffer[pos]); - - if (fail) - puts("Test failed"); - else - puts("Test succeeded"); - } - - /* - MAIN() - ------ - */ - int main(void) - { - check(sequence, sizeof(sequence) / sizeof(*sequence)); - } -#endif -/* - COMPRESS_QMX::DECODEARRAY() - --------------------------- - this code was generated by the method above. 
-*/ -static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; -static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; -static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; -static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; -static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f}; -static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; -static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; -static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; -static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; -static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; -static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; -void compress_qmx::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) -{ -__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; -uint8_t *in = (uint8_t *)source; -uint32_t *end = to + destination_integers; -uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1); -uint8_t *keys = (uint8_t *)source + len - key_start; - -mask_21 = _mm_loadu_si128((__m128i *)static_mask_21); -mask_12 = _mm_loadu_si128((__m128i *)static_mask_12); -mask_10 = _mm_loadu_si128((__m128i *)static_mask_10); -mask_9 = _mm_loadu_si128((__m128i *)static_mask_9); -mask_7 = _mm_loadu_si128((__m128i *)static_mask_7); -mask_6 = _mm_loadu_si128((__m128i *)static_mask_6); -mask_5 = _mm_loadu_si128((__m128i *)static_mask_5); -mask_4 = _mm_loadu_si128((__m128i *)static_mask_4); -mask_3 = _mm_loadu_si128((__m128i *)static_mask_3); -mask_2 = _mm_loadu_si128((__m128i *)static_mask_2); -mask_1 = _mm_loadu_si128((__m128i *)static_mask_1); - -while (to < end) - { - switch (*keys++) - { - case 0x00: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 
42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x01: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - 
_mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - 
_mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x02: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - 
_mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x03: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - 
_mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - 
_mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x04: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - 
_mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x05: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - 
_mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - 
_mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x06: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - 
_mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x07: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - 
_mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - 
_mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x08: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - 
_mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x09: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - 
_mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i 
*)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x0a: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - 
_mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x0b: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - 
_mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 
44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x0c: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - 
_mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x0d: -#ifdef NO_ZEROS - tmp = 
_mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - 
_mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x0e: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - 
_mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - _mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - 
_mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - case 0x0f: -#ifdef NO_ZEROS - tmp = _mm_loadu_si128((__m128i *)static_mask_1); -#else - tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); -#endif - _mm_storeu_si128((__m128i *)to, tmp); - _mm_storeu_si128((__m128i *)to + 1, tmp); - _mm_storeu_si128((__m128i *)to + 2, tmp); - _mm_storeu_si128((__m128i *)to + 3, tmp); - _mm_storeu_si128((__m128i *)to + 4, tmp); - _mm_storeu_si128((__m128i *)to + 5, tmp); - _mm_storeu_si128((__m128i *)to + 6, tmp); - _mm_storeu_si128((__m128i *)to + 7, tmp); - _mm_storeu_si128((__m128i *)to + 8, tmp); - _mm_storeu_si128((__m128i *)to + 9, tmp); - _mm_storeu_si128((__m128i *)to + 10, tmp); - _mm_storeu_si128((__m128i *)to + 11, tmp); - _mm_storeu_si128((__m128i *)to + 12, tmp); - _mm_storeu_si128((__m128i *)to + 13, tmp); - _mm_storeu_si128((__m128i *)to + 14, tmp); - _mm_storeu_si128((__m128i *)to + 15, tmp); - _mm_storeu_si128((__m128i *)to + 16, tmp); - _mm_storeu_si128((__m128i *)to + 17, tmp); - _mm_storeu_si128((__m128i *)to + 18, tmp); - _mm_storeu_si128((__m128i *)to + 19, tmp); - _mm_storeu_si128((__m128i *)to + 20, tmp); - _mm_storeu_si128((__m128i *)to + 21, tmp); - _mm_storeu_si128((__m128i *)to + 22, tmp); - _mm_storeu_si128((__m128i *)to + 23, tmp); - _mm_storeu_si128((__m128i *)to + 24, tmp); - _mm_storeu_si128((__m128i *)to + 25, tmp); - _mm_storeu_si128((__m128i *)to + 26, tmp); - _mm_storeu_si128((__m128i *)to + 27, tmp); - _mm_storeu_si128((__m128i *)to + 28, tmp); - _mm_storeu_si128((__m128i *)to + 29, tmp); - _mm_storeu_si128((__m128i *)to + 30, tmp); - _mm_storeu_si128((__m128i *)to + 31, tmp); - _mm_storeu_si128((__m128i *)to + 32, tmp); - _mm_storeu_si128((__m128i *)to + 33, tmp); - _mm_storeu_si128((__m128i *)to + 34, tmp); - _mm_storeu_si128((__m128i *)to + 35, tmp); - _mm_storeu_si128((__m128i *)to + 36, tmp); - 
_mm_storeu_si128((__m128i *)to + 37, tmp); - _mm_storeu_si128((__m128i *)to + 38, tmp); - _mm_storeu_si128((__m128i *)to + 39, tmp); - _mm_storeu_si128((__m128i *)to + 40, tmp); - _mm_storeu_si128((__m128i *)to + 41, tmp); - _mm_storeu_si128((__m128i *)to + 42, tmp); - _mm_storeu_si128((__m128i *)to + 43, tmp); - _mm_storeu_si128((__m128i *)to + 44, tmp); - _mm_storeu_si128((__m128i *)to + 45, tmp); - _mm_storeu_si128((__m128i *)to + 46, tmp); - _mm_storeu_si128((__m128i *)to + 47, tmp); - _mm_storeu_si128((__m128i *)to + 48, tmp); - _mm_storeu_si128((__m128i *)to + 49, tmp); - _mm_storeu_si128((__m128i *)to + 50, tmp); - _mm_storeu_si128((__m128i *)to + 51, tmp); - _mm_storeu_si128((__m128i *)to + 52, tmp); - _mm_storeu_si128((__m128i *)to + 53, tmp); - _mm_storeu_si128((__m128i *)to + 54, tmp); - _mm_storeu_si128((__m128i *)to + 55, tmp); - _mm_storeu_si128((__m128i *)to + 56, tmp); - _mm_storeu_si128((__m128i *)to + 57, tmp); - _mm_storeu_si128((__m128i *)to + 58, tmp); - _mm_storeu_si128((__m128i *)to + 59, tmp); - _mm_storeu_si128((__m128i *)to + 60, tmp); - _mm_storeu_si128((__m128i *)to + 61, tmp); - _mm_storeu_si128((__m128i *)to + 62, tmp); - _mm_storeu_si128((__m128i *)to + 63, tmp); - to += 256; - break; - case 0x10: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x11: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x12: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x13: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x14: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x15: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x16: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x17: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x18: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x19: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x1a: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x1b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x1c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x1d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x1e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - case 0x1f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); - 
byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); - byte_stream = 
_mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); - byte_stream = _mm_srli_epi64(byte_stream, 1); - _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); - in += 16; - to += 128; - break; - case 0x20: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, 
mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x21: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x22: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i 
*)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x23: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - 
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x24: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = 
_mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x25: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - 
byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x26: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, 
mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x27: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x28: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i 
*)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x29: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - 
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x2a: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = 
_mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x2b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - 
byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x2c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, 
mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x2d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x2e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i 
*)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - case 0x2f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); - byte_stream = _mm_srli_epi64(byte_stream, 2); - _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); - in += 16; - to += 64; - break; - case 0x30: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = 
_mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x31: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream 
= _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x32: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x33: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x34: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x35: - byte_stream = _mm_loadu_si128((__m128i *)in); 
- _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x36: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - 
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x37: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x38: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = 
_mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x39: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream 
= _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x3a: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x3b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x3c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 
9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x3d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x3e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); 
- _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - case 0x3f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); - byte_stream = _mm_srli_epi64(byte_stream, 3); - _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); - in += 16; - to += 40; - break; - case 0x40: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - 
byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x41: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x42: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x43: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x44: - byte_stream = _mm_loadu_si128((__m128i *)in); 
- _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x45: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x46: - byte_stream 
= _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x47: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to 
+= 32; - case 0x48: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x49: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x4a: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x4b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - 
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x4c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x4d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = 
_mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x4e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - case 0x4f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 6, 
_mm_and_si128(byte_stream, mask_4)); - byte_stream = _mm_srli_epi64(byte_stream, 4); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); - in += 16; - to += 32; - break; - case 0x50: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x51: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x52: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - 
byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x53: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x54: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x55: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x56: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x57: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = 
_mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x58: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x59: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x5a: - byte_stream = _mm_loadu_si128((__m128i *)in); - 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x5b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x5c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = 
_mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x5d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x5e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - case 0x5f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); - byte_stream = _mm_srli_epi64(byte_stream, 5); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); - in += 16; - to += 24; - break; - case 0x60: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x61: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x62: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x63: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x64: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x65: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = 
_mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x66: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x67: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x68: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x69: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x6a: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x6b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x6c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x6d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x6e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = 
_mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - case 0x6f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); - byte_stream = _mm_srli_epi64(byte_stream, 6); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); - in += 16; - to += 20; - break; - case 0x70: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = 
_mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x71: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x72: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x73: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x74: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i 
*)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x75: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - 
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x76: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x77: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x78: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 
36; - case 0x79: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x7a: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to 
+ 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x7b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x7c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x7d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - 
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x7e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - case 0x7f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), 
mask_7)); - byte_stream = _mm_srli_epi32(byte_stream_2, 3); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); - byte_stream = _mm_srli_epi32(byte_stream, 7); - _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); - in += 32; - to += 36; - break; - case 0x80: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x81: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x82: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, 
_mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x83: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x84: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x85: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = 
_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x86: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x87: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x88: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x89: 
- tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x8a: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x8b: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x8c: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 
0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x8d: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x8e: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - case 0x8f: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - _mm_storeu_si128((__m128i 
*)to + 2, _mm_cvtepu8_epi32(tmp)); - tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); - _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); - in += 16; - to += 16; - break; - case 0x90: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x91: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x92: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x93: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = 
_mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x94: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x95: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i 
*)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x96: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x97: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - 
to += 28; - case 0x98: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x99: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x9a: - byte_stream = 
_mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x9b: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x9c: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i 
*)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x9d: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x9e: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = 
_mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - case 0x9f: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); - byte_stream = _mm_srli_epi32(byte_stream_2, 4); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); - byte_stream = _mm_srli_epi32(byte_stream, 9); - _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); - in += 32; - to += 28; - break; - case 0xa0: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa1: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa2: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa3: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa4: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa5: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = 
_mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa6: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa7: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa8: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xa9: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xaa: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xab: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xac: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xad: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xae: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - case 0xaf: - byte_stream = _mm_loadu_si128((__m128i 
*)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); - byte_stream = _mm_srli_epi64(byte_stream, 10); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); - in += 16; - to += 12; - break; - case 0xb0: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb1: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb2: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = 
_mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb3: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb4: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb5: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb6: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb7: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = 
_mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb8: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xb9: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xba: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xbb: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xbc: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xbd: - byte_stream = _mm_loadu_si128((__m128i *)in); - 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xbe: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - case 0xbf: - byte_stream = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); - byte_stream = _mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); - byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); - _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); - byte_stream = _mm_srli_epi32(byte_stream_2, 8); - _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); - byte_stream = 
_mm_srli_epi32(byte_stream, 12); - _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); - in += 32; - to += 20; - break; - case 0xc0: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc1: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc2: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc3: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc4: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc5: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc6: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc7: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc8: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xc9: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xca: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xcb: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xcc: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xcd: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xce: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - case 0xcf: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); - _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); - in += 16; - to += 8; - break; - case 0xd0: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd1: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd2: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd3: - byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd4: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd5: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd6: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd7: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd8: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xd9: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xda: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xdb: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xdc: - byte_stream = _mm_loadu_si128((__m128i *)in); 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xdd: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xde: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - case 0xdf: - byte_stream = _mm_loadu_si128((__m128i *)in); _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); in += 32; - to += 12; - break; - case 0xe0: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe1: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe2: - tmp = 
_mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe3: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe4: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe5: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe6: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe7: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe8: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xe9: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xea: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xeb: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xec: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xed: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xee: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - case 0xef: - tmp = _mm_loadu_si128((__m128i *)in); - _mm_storeu_si128((__m128i *)to, tmp); - in += 16; - to += 4; - break; - case 0xf0: - in++; - case 0xf1: - in++; - case 0xf2: - in++; - case 0xf3: - in++; - case 0xf4: - in++; - case 0xf5: - in++; - case 0xf6: - in++; - case 0xf7: - in++; - case 0xf8: - in++; - case 0xf9: - in++; - case 0xfa: - in++; - case 0xfb: - in++; - case 0xfc: - in++; - case 0xfd: - in++; - case 0xfe: - in++; - case 0xff: - in++; - break; - } - } -} - -unsigned char *qmx_enc( 
const uint32_t *in, unsigned n, unsigned char *out) -{ compress_qmx compressor; - uint64_t r; - compressor.encodeArray(in, n, (uint32_t *)out, &r); - return out + r; -} - -unsigned char *qmx_dec(const unsigned char *in, unsigned len, uint32_t *out, unsigned n) -{ compress_qmx compressor; - compressor.decodeArray((uint32_t *)in, len, out, n); - return (unsigned char *)in + len; -} +static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; +static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; +static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; +static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; +static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f}; +static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; +static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; +static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; +static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; +static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; +static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; +void ANT_compress_qmx::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) +{ +__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; +uint8_t *in = (uint8_t *)source; +uint32_t *end = to + destination_integers; +uint32_t key_start = vbyte_decompress((uint8_t *)source + len - 1); +uint8_t *keys = (uint8_t *)source + len - key_start; + +mask_21 = _mm_loadu_si128((__m128i *)static_mask_21); +mask_12 = _mm_loadu_si128((__m128i *)static_mask_12); +mask_10 = _mm_loadu_si128((__m128i *)static_mask_10); +mask_9 = _mm_loadu_si128((__m128i *)static_mask_9); +mask_7 = _mm_loadu_si128((__m128i *)static_mask_7); +mask_6 = _mm_loadu_si128((__m128i *)static_mask_6); +mask_5 = 
_mm_loadu_si128((__m128i *)static_mask_5); +mask_4 = _mm_loadu_si128((__m128i *)static_mask_4); +mask_3 = _mm_loadu_si128((__m128i *)static_mask_3); +mask_2 = _mm_loadu_si128((__m128i *)static_mask_2); +mask_1 = _mm_loadu_si128((__m128i *)static_mask_1); + +while (to < end) + { + switch (*keys++) + { + case 0x00: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); 
+ _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x01: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + 
_mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + 
_mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x02: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + 
_mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x03: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x04: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + 
_mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x05: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + 
_mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i 
*)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x06: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + 
_mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x07: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 
42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x08: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + 
_mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + 
_mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x09: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0a: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + 
_mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + 
_mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0b: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + 
_mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0c: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + 
_mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0d: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0e: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + 
_mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + 
_mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0f: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + break; + case 0x10: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + 
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x11: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x12: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x13: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x14: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x15: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x16: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x17: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x18: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x19: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + break; + case 0x20: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, 
mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x21: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x22: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i 
*)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x23: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x24: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = 
_mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x25: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + 
byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x26: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, 
mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x27: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x28: + byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x29: + byte_stream = _mm_loadu_si128((__m128i 
*)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2a: + byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2b: + 
byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 
0x2c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; 
+ case 0x2d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to 
+= 64; + case 0x2e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 
16; + to += 64; + case 0x2f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); 
+ in += 16; + to += 64; + break; + case 0x30: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x31: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i 
*)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x32: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x33: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = 
_mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x34: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream 
= _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x35: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x36: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x37: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 
8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x38: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x39: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); 
+ _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3b: + 
byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + 
byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 
7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + break; + case 0x40: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x41: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = 
_mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x42: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x43: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x44: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x45: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + 
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x46: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x47: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = 
_mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x48: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x49: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = 
_mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + break; + case 0x50: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x51: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = 
_mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x52: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x53: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x54: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x55: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x56: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = 
_mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x57: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x58: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x59: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + 
to += 24; + case 0x5c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + break; + case 0x60: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x61: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x62: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x63: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x64: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = 
_mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x65: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x66: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x67: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x68: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x69: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = 
_mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + break; + case 0x70: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x71: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 
36; + case 0x72: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x73: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to 
+ 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x74: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x75: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x76: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + 
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x77: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x78: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), 
mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x79: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + 
byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + break; + case 0x80: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x81: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x82: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x83: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x84: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = 
_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x85: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x86: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x87: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x88: 
+ tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x89: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8a: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8b: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 
0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8c: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8d: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8e: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i 
*)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8f: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + break; + case 0x90: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x91: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x92: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x93: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x94: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x95: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = 
_mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x96: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x97: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i 
*)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x98: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x99: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), 
_mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + break; + case 0xa0: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa1: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa2: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa3: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa4: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa5: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa6: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa7: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa8: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = 
_mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa9: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xaa: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xab: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xac: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xad: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xae: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xaf: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + break; + case 0xb0: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb1: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); 
+ byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb2: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb3: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb4: + byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb5: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb6: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = 
_mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb7: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb8: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb9: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xba: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbb: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbc: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = 
_mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbd: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbe: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbf: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + break; + case 0xc0: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc1: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc2: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc3: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc4: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, 
_mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc5: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc6: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc7: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc8: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc9: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xca: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcb: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcc: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcd: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xce: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcf: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + break; + case 0xd0: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd1: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 
21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd2: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd3: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd4: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd5: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd6: + byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd7: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd8: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd9: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xda: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdb: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdc: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdd: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xde: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdf: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + break; + case 0xe0: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe1: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe2: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe3: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe4: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe5: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe6: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe7: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe8: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe9: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xea: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xeb: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xec: + tmp = _mm_loadu_si128((__m128i *)in); + 
_mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xed: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xee: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xef: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + break; + case 0xf0: + *to = *(uint8_t *)in; + in += 1; + to += 1; + case 0xf1: + *to = *(uint8_t *)in; + in += 1; + to += 1; + case 0xf2: + *to = *(uint8_t *)in; + in += 1; + to += 1; + case 0xf3: + *to = *(uint8_t *)in; + in += 1; + to += 1; + break; + case 0xf4: + *to = *(uint16_t *)in; + in += 2; + to += 1; + case 0xf5: + *to = *(uint16_t *)in; + in += 2; + to += 1; + case 0xf6: + *to = *(uint16_t *)in; + in += 2; + to += 1; + case 0xf7: + *to = *(uint16_t *)in; + in += 2; + to += 1; + break; + case 0xf8: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + case 0xf9: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + case 0xfa: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + case 0xfb: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + break; + case 0xfc: + *to = *(uint32_t *)in; + in += 4; + to += 1; + case 0xfd: + *to = *(uint32_t *)in; + in += 4; + to += 1; + case 0xfe: + *to = *(uint32_t *)in; + in += 4; + to += 1; + case 0xff: + *to = *(uint32_t *)in; + in += 4; + to += 1; + break; + break; + } + } +} diff --git a/ext/bench_/bench/compress_qmx_v2.cpp b/ext/bench_/bench/compress_qmx_v2.cpp new file mode 100644 index 0000000..784f11a --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v2.cpp @@ -0,0 +1,1468 @@ +/* + COMPRESS_QMX_V2.CPP + ------------------- + Copyright (c) 2014 by Andrew Trotman + Licensed BSD + + A 
version of BinPacking where we pack into a 128-bit SSE register the following: + 256 0-bit words + 128 1-bit words + 64 2-bit words + 40 3-bit words + 32 4-bit words + 24 5-bit words + 20 6-bit words + 16 8-bit words + 12 10-bit words + 8 16-bit words + 4 32-bit words + or pack into two 128-bit words (i.e. 256 bits) the following: + 36 7-bit words + 28 9-bit words + 20 12-bit words + 12 21-bit words + + This gives us 15 possible combinations. The combination is stored in the top 4 bits of a selector byte. The + bottom 4-bits of the selector store a run-length (the number of such sequences seen in a row). + + The 128-bit (or 256-bit) packed binary values are stored first. Then we store the selectors. Finally, + stored variable byte encoded, is a pointer to the start of the selector (from the end of the sequence). + + This way, all reads and writes are 128-bit word aligned, except addressing the selector (and the pointer + to the selector). These reads are byte aligned. + + Note: There is currently 1 unused encoding (i.e. 16 unused selector values). These might in the future be + used for encoding exceptions, much as PForDelta does. +*/ +#include +#include +#include +#include +#include +#include "compress_qmx_v2.h" + +//#define MAKE_DECOMPRESS 1 /* uncomment this and it will create a program that writes the decompressor */ +//#define TEST_ONE_STRING 1 /* Uncomment this and it will create a program that can be used to test the compressor and decompressor */ +#define NO_ZEROS 1 /* stores runs of 256 1s in a row (not 1-bit number, but actual 1 values). 
*/ +#define SHORT_END_BLOCKS 1 + +#ifdef _MSC_VER + #define ALIGN_16 __declspec(align(16)) +#else + #define ALIGN_16 __attribute__ ((aligned (16))) +#endif + +//#define STATS /* uncomment this and it will count the selector usage */ +#ifdef STATS + static uint32_t stats[65] = {0}; +#endif + +/* + ANT_COMPRESS_QMX_V2::ANT_COMPRESS_QMX_V2() + ------------------------------------ +*/ +ANT_compress_qmx_v2::ANT_compress_qmx_v2() +{ +length_buffer = NULL; +length_buffer_length = 0; +} + +/* + ANT_COMPRESS_QMX_V2::~ANT_COMPRESS_QMX_V2() + ------------------------------------- +*/ +ANT_compress_qmx_v2::~ANT_compress_qmx_v2() +{ +delete [] length_buffer; +#ifdef STATS + uint32_t which; + for (which = 0; which <= 32; which++) + if (stats[which] != 0) + printf("%d\t%d\ttimes\n", which, stats[which]); +#endif +} + +/* + BYTES_NEEDED_FOR() + ------------------ +*/ +static uint8_t bytes_needed_for(uint32_t value) +{ +if (value <= 0xFF) + return 1; +else if (value <= 0xFFFF) + return 2; +else if (value <= 0xFFFFFF) + return 3; +else + return 4; +} + +/* + BITS_NEEDED_FOR() + ----------------- +*/ +static uint8_t bits_needed_for(uint32_t value) +{ +if (value == 0x01) + return 0; +else if (value <= 0x01) + return 1; +else if (value <= 0x03) + return 2; +else if (value <= 0x07) + return 3; +else if (value <= 0x0F) + return 4; +else if (value <= 0x1F) + return 5; +else if (value <= 0x3F) + return 6; +else if (value <= 0x7F) + return 7; +else if (value <= 0xFF) + return 8; +else if (value <= 0x1FF) + return 9; +else if (value <= 0x3FF) + return 10; +else if (value <= 0xFFF) + return 12; +else if (value <= 0xFFFF) + return 16; +else if (value <= 0x1FFFFF) + return 21; +else + return 32; +} + +/* + WRITE_OUT() + ----------- +*/ +static void write_out(uint8_t **buffer, uint32_t *source, uint32_t raw_count, uint32_t size_in_bits, uint8_t **length_buffer) +{ +uint32_t current, batch; +uint8_t *destination = *buffer; +uint32_t *end = source + raw_count; +uint8_t *key_store = *length_buffer; 
+uint32_t ALIGN_16 sequence_buffer[4]; +uint32_t instance, value; +uint8_t type; +uint32_t count; + +uint32_t max_bytes = 1; // this is the bytw-width for type128 encoded non-SSE integers + +#ifdef STATS + stats[size_in_bits] += raw_count; +#endif + +if (size_in_bits == 0) + { + type = 0; + count = (raw_count + 255) / 256; + } +else if (size_in_bits == 1) + { + type = 1; // 1 bit per integer + count = (raw_count + 127) / 128; + } +else if (size_in_bits == 2) + { + type = 2; // 2 bits per integer + count = (raw_count + 63) / 64; + } +else if (size_in_bits == 3) + { + type = 3; // 3 bits per integer + count = (raw_count + 39) / 40; + } +else if (size_in_bits == 4) + { + type = 4; // 4 bits per integer + count = (raw_count + 31) / 32; + } +else if (size_in_bits == 5) + { + type = 5; // 5 bits per integer + count = (raw_count + 23) / 24; + } +else if (size_in_bits == 6) + { + type = 6; // 6 bits per integer + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 7) + { + type = 7; // 7 bits per integer, 18 integers per read (but requires 2 reads) + count = (raw_count + 35) / 36; + } +else if (size_in_bits == 8) + { + type = 8; // 8 bits per integer + count = (raw_count + 15) / 16; + } +else if (size_in_bits == 9) + { + type = 9; // 9 bits per integer, 14 integers per read (but requires 2 reads) + count = (raw_count + 27) / 28; + } +else if (size_in_bits == 10) + { + type = 10; // 10 bits per integer + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 12) + { + type = 11; // 12 bits per integer, 10 integers per read (but requires 2 reads) + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 16) + { + type = 12; // 16 bits per integer + count = (raw_count + 7) / 8; + } +else if (size_in_bits == 21) + { + type = 13; // 21 bits per integer, 6 integers per read (but requires 2 reads) + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 32) + { + type = 14; // 32 bits per integer + count = (raw_count + 3) / 4; + } +else if (size_in_bits 
== 128) + { + type = 15; + count = raw_count; + /* + As the count for type 128 can only be 1, 2, or 3, we can re-appropriate it and store the bit-length in there too. + */ + max_bytes = 1; + for (uint32_t integer = 0; integer < count; integer++) + { + if (bytes_needed_for(source[integer]) > max_bytes) + max_bytes = bytes_needed_for(source[integer]); + } + } +else + exit(printf("Can't compress into integers of size %dbits\n", size_in_bits)); + +while (count > 0) + { + batch = count > 16 ? 16 : count; + *key_store++ = (type << 4) | (~(batch - 1) & 0x0F); + + count -= batch; + + for (current = 0; current < batch; current++) + { + switch (size_in_bits) + { + case 0: // 0 bits per integer (i.e. a long sequence of zeros) + /* + In this case we don't need to store a 4 byte integer because its implicit + */ + source += 256; + break; + case 1: // 1 bit per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 128; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 1); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 128; + break; + case 2: // 2 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 64; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 2); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 64; + break; + case 3: // 3 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 40; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 3); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 40; + break; + case 4: // 4 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 32; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 4); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 32; + break; + 
case 5: // 5 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 24; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 5); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 24; + break; + case 6: // 6 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 6); + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 20; + break; + case 7: // 7 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 7); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 16; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] >> 4; + for (value = 20; value < 36; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 20) / 4) * 7 + 3); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 36; // 36 in a double 128-bit word + break; + case 8: // 8 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 16 && source < end; instance++) +#else + for (instance = 0; instance < 16; instance++) +#endif + *destination++ = (uint8_t)*source++; + break; + case 9: // 9 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 9); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 12; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] >> 5; + for (value = 16; value < 28; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 16) / 4) * 9 + 
4); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 28; // 28 in a double 128-bit word + break; + case 10: // 10 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 10); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 12; + break; + case 12: // 12 bit integers + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 12); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] >> 8; + for (value = 12; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 12) / 4) * 12 + 8); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 20; // 20 in a double 128-bit word + break; + case 16: // 16 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 8 && source < end; instance++) +#else + for (instance = 0; instance < 8; instance++) +#endif + { + *(uint16_t *)destination = (uint16_t)*source++; + destination += 2; + } + break; + case 21: // 21 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 21); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 4; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] >> 11; + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 8) / 4) * 21 + 11); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 12; // 12 in a double 128-bit word 
+ break; + case 32: // 32 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 4 && source < end; instance++) +#else + for (instance = 0; instance < 4; instance++) +#endif + { + *(uint32_t *)destination = (uint32_t)*source++; + destination += 4; + } + break; + case 128: + if (max_bytes == 1) + { + *(uint8_t *)destination = (uint8_t)*source; + source++; + destination += 1; + *(key_store - 1) = (type << 4) | (~(batch - 1) & 0x03); + } + else if (max_bytes == 2) + { + *(uint16_t *)destination = (uint16_t)*source; + source++; + destination += 2; + *(key_store - 1) = (type << 4) | 4 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 3) + { + *destination++ = (uint8_t)((*source >> 16) & 0xFF); + *destination++ = (uint8_t)((*source >> 8) & 0xFF); + *destination++ = (uint8_t)((*source >> 0) & 0xFF); + source++; + + *(key_store - 1) = (type << 4) | 8 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 4) + { + *(uint32_t *)destination = (uint32_t)*source; + source++; + destination += 4; + *(key_store - 1) = (type << 4) | 0x0C | (~(batch - 1) & 0x03); + } + else + printf("max_bytes must be 1, 2, 3, or 4, but is:%d", (int)max_bytes); + break; + } + } + } +*buffer = destination; +*length_buffer = key_store; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b) +{ +return a > b ? 
a : b; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b, T c, T d) +{ +return max(max(a, b), max(c, d)); +} + +/* + ANT_COMPRESS_QMX_V2::ENCODEARRAY() + ------------------------------- +*/ +void ANT_compress_qmx_v2::encodeArray(const uint32_t *source, uint64_t source_integers, uint32_t *into, uint64_t *nvalue) +{ +const uint32_t WASTAGE = 512; +uint8_t *current_length, *destination = (uint8_t *)into, *keys; +uint32_t *current, run_length, bits, new_needed, wastage; +uint32_t block, largest; + +/* + make sure we have enough room to store the lengths +*/ +if (length_buffer_length < source_integers) + { + delete [] length_buffer; + length_buffer = new uint8_t [(size_t)(length_buffer_length = source_integers) + WASTAGE]; + } + +/* + Get the lengths of the integers +*/ +current_length = length_buffer; +for (current = (uint32_t *)source; current < source + source_integers; current++) + *current_length++ = bits_needed_for(*current); + +/* + Shove a bunch of 0 length integers on the end to allow for overflow +*/ +for (wastage = 0; wastage < WASTAGE; wastage++) + *current_length++ = 0; + +/* + Process the lengths. To maximise SSE throughput we need each write to be 128-bit (4*32-bit) alignned + and therefore we need each compress "block" to be the same size where a compress "block" is a set of + four encoded integers starting on a 4-integer boundary. 
+*/ +for (current_length = length_buffer; current_length < length_buffer + source_integers + 4; current_length += 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = max(*current_length, *(current_length + 1), *(current_length + 2), *(current_length + 3)); + +/* + This code makes sure we can do aligned reads, promoting to larger integers if necessary +*/ +current_length = length_buffer; +while (current_length < length_buffer + source_integers) + { +#ifdef SHORT_END_BLOCKS + /* + If there are fewer than 16 values remaining and they all fit into 8-bits then its smaller than storing stripes + If there are fewer than 8 values remaining and they all fit into 16-bits then its smaller than storing stripes + If there are fewer than 4 values remaining and they all fit into 32-bits then its smaller than storing stripes + */ + if (source_integers - (current_length - length_buffer) < 4) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 16) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + else if (largest <= 32) + for (block = 0; block < 8; block++) + *(current_length + block) = 32; + } + else if (source_integers - (current_length - length_buffer) < 8) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + } + else if (source_integers - (current_length - length_buffer) < 16) + { + largest = 0; + for (block = 0; block < 16; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 16; block++) + *(current_length + block) = 8; 
+ } + /* + Otherwise we have the standard rules for a block + */ +#endif + /* + Two things need to happen to be able to use a particular selector. The first is that all the + values that would end up in that block need to use at most the bit value of that block. + The second is that there need to be at least as many numbers remaining as the block encodes. + + For example, if the current block only needs 0-bits per int, then check that the 256 values + that would be encoded only take 0-bits. If any value needs more, or there aren't 256 numbers remaining, + then promote the current block to try encode 128 1-bit values. + */ + switch (*current_length) + { + case 0: + if (source_integers - (current_length - length_buffer) < 256) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + break; + } + for (block = 0; block < 256; block += 4) + if (*(current_length + block) > 0) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + if (*current_length == 0) + { + for (block = 0; block < 256; block++) + current_length[block] = 0; + current_length += 256; + } + break; + case 1: + if (source_integers - (current_length - length_buffer) < 128) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + break; + } + for (block = 0; block < 128; block += 4) + if (*(current_length + block) > 1) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + if (*current_length == 1) + { + for (block = 0; block < 128; block++) + current_length[block] = 1; + current_length += 128; + } + break; + case 2: + if (source_integers - (current_length - length_buffer) < 64) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + break; + } + for (block = 0; block < 64; block += 4) + if (*(current_length + block) > 2) + *current_length = 
*(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + if (*current_length == 2) + { + for (block = 0; block < 64; block++) + current_length[block] = 2; + current_length += 64; + } + break; + case 3: + if (source_integers - (current_length - length_buffer) < 40) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + break; + } + for (block = 0; block < 40; block += 4) + if (*(current_length + block) > 3) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + if (*current_length == 3) + { + for (block = 0; block < 40; block++) + current_length[block] = 3; + current_length += 40; + } + break; + case 4: + if (source_integers - (current_length - length_buffer) < 32) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + break; + } + for (block = 0; block < 32; block += 4) + if (*(current_length + block) > 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + if (*current_length == 4) + { + for (block = 0; block < 32; block++) + current_length[block] = 4; + current_length += 32; + } + break; + case 5: + if (source_integers - (current_length - length_buffer) < 24) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + break; + } + for (block = 0; block < 24; block += 4) + if (*(current_length + block) > 5) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + if (*current_length == 5) + { + for (block = 0; block < 24; block++) + current_length[block] = 5; + current_length += 24; + } + break; + case 6: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + break; + } + for (block = 0; block < 
20; block += 4) + if (*(current_length + block) > 6) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + if (*current_length == 6) + { + for (block = 0; block < 20; block++) + current_length[block] = 6; + current_length += 20; + } + break; + case 7: + if (source_integers - (current_length - length_buffer) < 36) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + break; + } + for (block = 0; block < 36; block += 4) // 36 in a double 128-bit word + if (*(current_length + block) > 7) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + if (*current_length == 7) + { + for (block = 0; block < 36; block++) + current_length[block] = 7; + current_length += 36; + } + break; + case 8: + if (source_integers - (current_length - length_buffer) < 16) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + break; + } + for (block = 0; block < 16; block += 4) + if (*(current_length + block) > 8) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + if (*current_length == 8) + { + for (block = 0; block < 16; block++) + current_length[block] = 8; + current_length += 16; + } + break; + case 9: + if (source_integers - (current_length - length_buffer) < 28) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote + break; + } + for (block = 0; block < 28; block += 4) // 28 in a double 128-bit word + if (*(current_length + block) > 9) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote + if (*current_length == 9) + { + for (block = 0; block < 28; block++) + current_length[block] = 9; + current_length += 28; + } + break; + case 10: + if (source_integers - (current_length - length_buffer) < 12) + { + 
*current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + break; + } + for (block = 0; block < 12; block += 4) + if (*(current_length + block) > 10) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + if (*current_length == 10) + { + for (block = 0; block < 12; block++) + current_length[block] = 10; + current_length += 12; + } + break; + case 12: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + break; + } + for (block = 0; block < 20; block += 4) // 20 in a double 128-bit word + if (*(current_length + block) > 12) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + if (*current_length == 12) + { + for (block = 0; block < 20; block++) + current_length[block] = 12; + current_length += 20; + } + break; + case 16: + if (source_integers - (current_length - length_buffer) < 8) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + break; + } + for (block = 0; block < 8; block += 4) + if (*(current_length + block) > 16) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + if (*current_length == 16) + { + for (block = 0; block < 8; block++) + current_length[block] = 16; + current_length += 8; + } + break; + case 21: + if (source_integers - (current_length - length_buffer) < 12) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + break; + } + for (block = 0; block < 12; block += 4) // 12 in a double 128-bit word + if (*(current_length + block) > 21) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + if (*current_length == 21) + { + for (block = 0; block < 12; 
block++) + current_length[block] = 21; + current_length += 12; + } + break; + case 32: + if (source_integers - (current_length - length_buffer) < 4) + { + for (block = 0; block < (source_integers - (current_length - length_buffer)); block++) + *(current_length + block) = 128; // promote + break; + } + for (block = 0; block < 4; block += 4) + if (*(current_length + block) > 32) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 64; // promote + if (*current_length == 32) + { + for (block = 0; block < 4; block++) + current_length[block] = 32; + current_length += 4; + } + break; + case 128: + /* + The 128-bit selector is used as a last resort when there are not enough numbers to use an + earlier selector. So don't worry about checking the rest. + */ + current_length += source_integers - (current_length - length_buffer); + break; + default: + exit(printf("Selecting on a non whole power of 2, must exit\n")); + break; + } + } + +/* + We can now compress based on the lengths in length_buffer +*/ +run_length = 1; +bits = length_buffer[0]; +keys = length_buffer; // we're going to re-use the length_buffer because it can't overlap and this saves a double malloc +for (current = (uint32_t *)source + 1; current < source + source_integers; current++) + { + new_needed = length_buffer[current - source]; + if (new_needed == bits) + run_length++; + else + { + write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + bits = new_needed; + run_length = 1; + } + } +write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + +/* + Copy the lengths to the end, backwards +*/ +uint8_t *from = length_buffer + (keys - length_buffer) - 1; +uint8_t *to = destination; +for (uint32_t pos = 0; pos < keys - length_buffer; pos++) + *to++ = *from--; +destination += keys - length_buffer; + +/* + Compute the length (in bytes) +*/ +*nvalue = destination - (uint8_t *)into; // return length in bytes +} + +#ifdef 
MAKE_DECOMPRESS + /* + The following program generates the source code for ANT_compress_qmx_v2::decodeArray() + */ + /* + MAIN() + ------ + This version assumes SSE4.1 and so it is *not* portable to non X86 architectures + */ + int main(void) + { + uint32_t instance; + + printf("static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};\n"); + printf("static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};\n"); + printf("static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};\n"); + printf("static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f};\n"); + printf("static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};\n"); + printf("static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};\n"); + printf("static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};\n"); + printf("static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};\n"); + printf("static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01};\n"); + printf("void ANT_compress_qmx_v2::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)\n"); + printf("{\n"); + printf("__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;\n"); + printf("uint8_t *in = (uint8_t *)source;\n"); + printf("uint8_t *keys = ((uint8_t *)source) + len - 1;\n"); + + printf("\n"); + printf("mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);\n"); + printf("mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);\n"); + printf("mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);\n"); + printf("mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);\n"); + printf("mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);\n"); 
+ printf("mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);\n"); + printf("mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);\n"); + printf("mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);\n"); + printf("mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);\n"); + printf("mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);\n"); + printf("mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("\n"); + + printf("while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers\n"); + printf("\t{\n"); + printf("\tswitch (*keys--)\n"); + printf("\t\t{\n"); + + for (instance = 0; instance <= 0xFF; instance++) + { + printf("\t\tcase 0x%02x:\n", instance); + if ((instance >> 4) == 0) + { + /* + 256 0-bit integers + */ + printf("#ifdef NO_ZEROS\n"); + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("#else\n"); + printf("\t\t\ttmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp)));\n"); + printf("#endif\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, tmp);\n"); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 32, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 33, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 34, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 35, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 36, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 37, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 38, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 39, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 40, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 41, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 42, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 43, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 44, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 45, tmp);\n"); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 46, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 47, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 48, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 49, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 50, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 51, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 52, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 53, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 54, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 55, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 56, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 57, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 58, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 59, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 60, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 61, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 62, tmp);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 63, tmp);\n"); + printf("\t\t\tto += 256;\n"); // becomes 256 integers + } + else if (instance >> 4 == 1) + { + /* + 128 * 1-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 16, 
_mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1));\n"); + 
printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 128;\n"); // becomes 128 integers + } + else if (instance >> 4 == 2) + { + /* + 64 * 2-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 
2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 2);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 64;\n"); // becomes 64 integers + } + else if (instance >> 4 == 3) + { + /* + 40 * 3-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to 
+ 3, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 40;\n"); // becomes 40 integers + } + else if (instance >> 4 == 4) + { + /* + 32 * 4-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4));\n"); + 
printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 32;\n"); // becomes 32 integers + } + else if (instance >> 4 == 5) + { + /* + 24 * 5-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 5);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 24;\n"); // becomes 24 integers + } + else if (instance >> 4 == 6) + { + /* + 20 * 6-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 
6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 6);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 20;\n"); // becomes 20 integers + } + else if (instance >> 4 == 7) + { + /* + 36 * 7 bit integers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7));\n"); + + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 3);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7));\n"); + 
printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 7);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 36;\n"); // becomes 36 integers + } + else if (instance >> 4 == 8) + { + /* + 16 * 8-bit integers + */ + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp));\n"); + printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2));\n"); + printf("\t\t\ttmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp));\n"); + printf("\t\t\ttmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 16;\n"); // becomes 16 integers + } + else if (instance >> 4 == 9) + { + /* + 28 * 9-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i 
*)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 4);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 9);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9));\n"); + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 28;\n"); // becomes 28 integers + } + else if (instance >> 4 == 10) + { + /* + 12 * 10-bit integers + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi64(byte_stream, 10);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 12;\n"); // becomes 12 integers + } + else if (instance >> 4 == 11) + { + /* + 20 * 12-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), 
mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream_2, 8);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12));\n"); + printf("\t\t\tbyte_stream = _mm_srli_epi32(byte_stream, 12);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 20;\n"); // becomes 20 integers + } + else if (instance >> 4 == 12) + { + /* + 16-bit integers + */ + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 8;\n"); // becomes 8 integers + } + else if (instance >> 4 == 13) + { + /* + 12 * 21-bit ingtegers (in two 128-bit words) + */ + printf("\t\t\tbyte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21));\n"); + printf("\t\t\tbyte_stream_2 = _mm_loadu_si128((__m128i *)in + 1);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));\n"); + + printf("\t\t\tin += 32;\n"); // 32 bytes + printf("\t\t\tto += 12;\n"); // becomes 8 integers + } + else if (instance >> 4 == 14) + { + /* + 32-bit integers + */ + printf("\t\t\ttmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_storeu_si128((__m128i *)to, tmp);\n"); + + printf("\t\t\tin += 16;\n"); // 16 bytes + printf("\t\t\tto += 4;\n"); // becomes 4 integers + } + else if (instance >> 4 == 15) + { + /* + 128-bit integers + if there are fewer than 4 integes then we 
just bit-pack them in to 8, 16, 24, or 32-bit words + */ + if ((instance & 0x0C) == 0x00) + { + printf("\t\t\t*to = *(uint8_t *)in;\n"); + printf("\t\t\tin += 1;\n"); // 1 byte integer + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x04) + { + printf("\t\t\t*to = *(uint16_t *)in;\n"); + printf("\t\t\tin += 2;\n"); // 2 byte integers + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x08) + { + printf("\t\t\t*to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2));\n"); + printf("\t\t\tin += 3;\n"); // 3 byte integer + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x0C) + { + printf("\t\t\t*to = *(uint32_t *)in;\n"); + printf("\t\t\tin += 4;\n"); // 4 byte integer + printf("\t\t\tto += 1;\n"); // becomes 1 integer + } + if (instance == 0xFF || instance == 0xFB || instance == 0xF7 || instance == 0xF3) + printf("\t\t\tbreak;\n"); + } + else + { + printf("\t\t\tin++;\n"); // dummy, can't occur + } + if ((instance & 0xF) == 0xF) + printf("\t\t\tbreak;\n"); // every 32 instances we break (its the end of the fall through) + } + printf("\t\t}\n"); + printf("\t}\n"); + printf("}\n"); + } +#endif + +#ifdef TEST_ONE_STRING + static uint32_t sequence[]={13,1,1,26,18,3,1,9,4,8,5,19,7,26,1,5,7,3,12,5,39,16,3,5,19,8,18,1,1,1,2,5,9,3,21,2,6,37,3,5,5,18,3,31,3,22,5,17,6,12,6,2,5,10,3,12,51,14,7,8,1,2,3,27,19,1,10,8,2,7,2,9,16,6,6,5,6,4,18,21,13,2,1,11,3,22,2,16,13,61,21,12,51,10,6,31,14,65,15,82,5,4,18,3,1,1,4,34,5,9,4,7,1,25,17,52,60,8,8,4,22,7,49,26,2,72,29,33,6,11,3,8,1,23,37,1,3,1,1,1,3,20,6,1,2,1,1,1,14,2,4,1,6,4,4,3,1,1,2,2,1,9,29,1,10,11,4,10,31}; + + static uint32_t second_compress_buffer[100000]; + static uint32_t second_decompress_buffer[100000]; + + uint32_t second_compress_buffer_size = sizeof(second_compress_buffer) / sizeof(*second_compress_buffer); + uint32_t second_decompress_buffer_size = sizeof(second_decompress_buffer) / 
sizeof(*second_decompress_buffer);
+
+	/*
+		CHECK() - round-trip sequence[] through encodeArray()/decodeArray(), dump the encoded bytes in hex, print a per-element verdict
+		-------
+	*/
+	void check(uint32_t *sequence, uint32_t sequence_length)
+		{
+		ANT_compress_qmx_v2 compressor;
+		uint64_t buffer_size;		// encoded length reported by encodeArray(); used below as a byte count
+		uint32_t pos;
+		bool fail;
+
+		memset(second_compress_buffer, 0, sizeof(second_compress_buffer));		// memset() takes bytes; the *_buffer_size globals are element counts and cleared only a quarter of each buffer
+		memset(second_decompress_buffer, 0, sizeof(second_decompress_buffer));
+
+		compressor.encodeArray(sequence, sequence_length, (uint32_t *)second_compress_buffer, &buffer_size);
+		((uint8_t *)second_compress_buffer)[buffer_size] = 0;		// zero the four bytes directly after the encoded stream:
+		((uint8_t *)second_compress_buffer)[buffer_size + 1] = 0;	// buffer_size is in bytes, so the pad is addressed through a byte pointer
+		((uint8_t *)second_compress_buffer)[buffer_size + 2] = 0;	// (indexing the uint32_t array with it landed at byte offset 4 * buffer_size)
+		((uint8_t *)second_compress_buffer)[buffer_size + 3] = 0;
+
+		for (pos = 0; pos < buffer_size; pos++)		// hex dump of the encoded stream, one byte at a time
+			printf("%02X ", ((uint8_t *)second_compress_buffer)[pos]);
+		puts("");
+
+		compressor.decodeArray((uint32_t *)second_compress_buffer, buffer_size, (uint32_t *)second_decompress_buffer, sequence_length);
+
+		fail = false;
+		for (pos = 0; pos < sequence_length; pos++)		// compare every decoded value with its source value; keep going after a mismatch so all are listed
+			if (sequence[pos] != second_decompress_buffer[pos])
+				{
+				printf("p[%u]:%X != %X\n", pos, sequence[pos], second_decompress_buffer[pos]);		// %u: pos is unsigned
+				fail = true;
+				}
+			else
+				printf("p[%u]:%X == %X\n", pos, sequence[pos], second_decompress_buffer[pos]);
+
+		if (fail)
+			puts("Test failed");
+		else
+			puts("Test succeeded");
+		}
+
+	/*
+		MAIN() - run check() once over the built-in test sequence
+		------
+	*/
+	int main(void)
+		{
+		check(sequence, sizeof(sequence) / sizeof(*sequence));
+		}
+#endif
+/*
+	ANT_COMPRESS_QMX_V2::DECODEARRAY()
+	--------------------------------
+	this code was generated by the method above.
+*/ +#include "compress_qmx_v2_decompress.cpp" diff --git a/ext/bench_/bench/compress_qmx_v2.h b/ext/bench_/bench/compress_qmx_v2.h new file mode 100644 index 0000000..35f7939 --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v2.h @@ -0,0 +1,46 @@ +/* + COMPRESS_QMX_V2.H + ------------------ + QMX with: + no overflow (Matt's changes) + no VB lengths (backwards scanning selectors) +*/ +#ifndef COMPRESS_QMX_V2_H_ +#define COMPRESS_QMX_V2_H_ + +#include +#include "compress.h" + +/* + class ANT_COMPRESS_QMX_V2 + ------------------------- +*/ +class ANT_compress_qmx_v2 : public ANT_compress + { + private: + uint8_t *length_buffer; + uint64_t length_buffer_length; + + public: + ANT_compress_qmx_v2(); + virtual ~ANT_compress_qmx_v2(); + + void encodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t *nvalue); + static void decodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t nvalue); + + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) + { + uint64_t answer; + encodeArray(source, source_integers, (uint32_t *)destination, &answer); + return answer; + } + + virtual void decompress(uint32_t *destination, uint64_t destinaton_integers, uint8_t *source, uint64_t source_length) + { + decodeArray((uint32_t *)source, source_length, destination, destinaton_integers); + } + } ; + +#endif + + diff --git a/ext/bench_/bench/compress_qmx_v2_decompress.cpp b/ext/bench_/bench/compress_qmx_v2_decompress.cpp new file mode 100644 index 0000000..a3a4d9e --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v2_decompress.cpp @@ -0,0 +1,5448 @@ +static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; +static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; +static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; +static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; +static uint32_t ALIGN_16 static_mask_7[] = 
{0x7f, 0x7f, 0x7f, 0x7f}; +static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; +static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; +static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; +static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; +static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; +static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; +void ANT_compress_qmx_v2::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) +{ +__m128i byte_stream, byte_stream_2, tmp, tmp2, mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; +uint8_t *in = (uint8_t *)source; +uint8_t *keys = ((uint8_t *)source) + len - 1; + +mask_21 = _mm_loadu_si128((__m128i *)static_mask_21); +mask_12 = _mm_loadu_si128((__m128i *)static_mask_12); +mask_10 = _mm_loadu_si128((__m128i *)static_mask_10); +mask_9 = _mm_loadu_si128((__m128i *)static_mask_9); +mask_7 = _mm_loadu_si128((__m128i *)static_mask_7); +mask_6 = _mm_loadu_si128((__m128i *)static_mask_6); +mask_5 = _mm_loadu_si128((__m128i *)static_mask_5); +mask_4 = _mm_loadu_si128((__m128i *)static_mask_4); +mask_3 = _mm_loadu_si128((__m128i *)static_mask_3); +mask_2 = _mm_loadu_si128((__m128i *)static_mask_2); +mask_1 = _mm_loadu_si128((__m128i *)static_mask_1); + +while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers + { + switch (*keys--) + { + case 0x00: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, 
tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + 
_mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x01: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x02: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + 
_mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i 
*)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x03: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + 
_mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x04: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + 
_mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 
44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x05: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + 
_mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x06: +#ifdef NO_ZEROS + tmp = 
_mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + 
_mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x07: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + 
_mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + 
_mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x08: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + 
_mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x09: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + 
_mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + 
_mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0a: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0b: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + 
_mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0c: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + 
_mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0d: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + 
_mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + 
_mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0e: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + _mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i *)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + case 0x0f: +#ifdef NO_ZEROS + tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + tmp = _mm_castps_si128(_mm_xor_ps(_mm_cvtepu8_epi32(tmp), _mm_cvtepu8_epi32(tmp))); +#endif + _mm_storeu_si128((__m128i *)to, tmp); + 
_mm_storeu_si128((__m128i *)to + 1, tmp); + _mm_storeu_si128((__m128i *)to + 2, tmp); + _mm_storeu_si128((__m128i *)to + 3, tmp); + _mm_storeu_si128((__m128i *)to + 4, tmp); + _mm_storeu_si128((__m128i *)to + 5, tmp); + _mm_storeu_si128((__m128i *)to + 6, tmp); + _mm_storeu_si128((__m128i *)to + 7, tmp); + _mm_storeu_si128((__m128i *)to + 8, tmp); + _mm_storeu_si128((__m128i *)to + 9, tmp); + _mm_storeu_si128((__m128i *)to + 10, tmp); + _mm_storeu_si128((__m128i *)to + 11, tmp); + _mm_storeu_si128((__m128i *)to + 12, tmp); + _mm_storeu_si128((__m128i *)to + 13, tmp); + _mm_storeu_si128((__m128i *)to + 14, tmp); + _mm_storeu_si128((__m128i *)to + 15, tmp); + _mm_storeu_si128((__m128i *)to + 16, tmp); + _mm_storeu_si128((__m128i *)to + 17, tmp); + _mm_storeu_si128((__m128i *)to + 18, tmp); + _mm_storeu_si128((__m128i *)to + 19, tmp); + _mm_storeu_si128((__m128i *)to + 20, tmp); + _mm_storeu_si128((__m128i *)to + 21, tmp); + _mm_storeu_si128((__m128i *)to + 22, tmp); + _mm_storeu_si128((__m128i *)to + 23, tmp); + _mm_storeu_si128((__m128i *)to + 24, tmp); + _mm_storeu_si128((__m128i *)to + 25, tmp); + _mm_storeu_si128((__m128i *)to + 26, tmp); + _mm_storeu_si128((__m128i *)to + 27, tmp); + _mm_storeu_si128((__m128i *)to + 28, tmp); + _mm_storeu_si128((__m128i *)to + 29, tmp); + _mm_storeu_si128((__m128i *)to + 30, tmp); + _mm_storeu_si128((__m128i *)to + 31, tmp); + _mm_storeu_si128((__m128i *)to + 32, tmp); + _mm_storeu_si128((__m128i *)to + 33, tmp); + _mm_storeu_si128((__m128i *)to + 34, tmp); + _mm_storeu_si128((__m128i *)to + 35, tmp); + _mm_storeu_si128((__m128i *)to + 36, tmp); + _mm_storeu_si128((__m128i *)to + 37, tmp); + _mm_storeu_si128((__m128i *)to + 38, tmp); + _mm_storeu_si128((__m128i *)to + 39, tmp); + _mm_storeu_si128((__m128i *)to + 40, tmp); + _mm_storeu_si128((__m128i *)to + 41, tmp); + _mm_storeu_si128((__m128i *)to + 42, tmp); + _mm_storeu_si128((__m128i *)to + 43, tmp); + _mm_storeu_si128((__m128i *)to + 44, tmp); + _mm_storeu_si128((__m128i 
*)to + 45, tmp); + _mm_storeu_si128((__m128i *)to + 46, tmp); + _mm_storeu_si128((__m128i *)to + 47, tmp); + _mm_storeu_si128((__m128i *)to + 48, tmp); + _mm_storeu_si128((__m128i *)to + 49, tmp); + _mm_storeu_si128((__m128i *)to + 50, tmp); + _mm_storeu_si128((__m128i *)to + 51, tmp); + _mm_storeu_si128((__m128i *)to + 52, tmp); + _mm_storeu_si128((__m128i *)to + 53, tmp); + _mm_storeu_si128((__m128i *)to + 54, tmp); + _mm_storeu_si128((__m128i *)to + 55, tmp); + _mm_storeu_si128((__m128i *)to + 56, tmp); + _mm_storeu_si128((__m128i *)to + 57, tmp); + _mm_storeu_si128((__m128i *)to + 58, tmp); + _mm_storeu_si128((__m128i *)to + 59, tmp); + _mm_storeu_si128((__m128i *)to + 60, tmp); + _mm_storeu_si128((__m128i *)to + 61, tmp); + _mm_storeu_si128((__m128i *)to + 62, tmp); + _mm_storeu_si128((__m128i *)to + 63, tmp); + to += 256; + break; + case 0x10: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x11: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x12: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x13: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x14: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x15: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x16: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x17: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x18: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x19: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + case 0x1f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_1)); + 
byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 17, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 19, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 22, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 23, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_1)); + byte_stream = 
_mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 26, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 29, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_1)); + byte_stream = _mm_srli_epi64(byte_stream, 1); + _mm_storeu_si128((__m128i *)to + 31, _mm_and_si128(byte_stream, mask_1)); + in += 16; + to += 128; + break; + case 0x20: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, 
mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x21: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x22: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i 
*)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x23: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + 
_mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x24: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = 
_mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x25: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + 
byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x26: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, 
mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x27: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x28: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i 
*)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x29: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + 
_mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = 
_mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + 
byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, 
mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 6, 
_mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i 
*)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + case 0x2f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + 
_mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 11, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 13, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_2)); + byte_stream = _mm_srli_epi64(byte_stream, 2); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_2)); + in += 16; + to += 64; + break; + case 0x30: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = 
_mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x31: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x32: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x33: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 
7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x34: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x35: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); 
+ _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x36: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 
3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x37: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x38: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + 
byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x39: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 
6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); 
+ _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 
3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + case 0x3f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_3)); + byte_stream = _mm_srli_epi64(byte_stream, 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_3)); + in += 16; + to += 40; + break; + case 0x40: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); 
+ byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x41: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x42: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x43: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x44: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x45: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x46: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = 
_mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x47: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x48: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x49: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = 
_mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + case 0x4f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_4)); + byte_stream = _mm_srli_epi64(byte_stream, 4); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_4)); + in += 16; + to += 32; + break; + case 0x50: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = 
_mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x51: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x52: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x53: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x54: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x55: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = 
_mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x56: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x57: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x58: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x59: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = 
_mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5d: + byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + case 0x5f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_5)); + byte_stream = 
_mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_5)); + byte_stream = _mm_srli_epi64(byte_stream, 5); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_5)); + in += 16; + to += 24; + break; + case 0x60: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x61: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x62: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x63: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x64: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x65: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, 
_mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x66: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x67: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x68: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = 
_mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x69: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + 
_mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + case 0x6f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_6)); + byte_stream = _mm_srli_epi64(byte_stream, 6); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_6)); + in += 16; + to += 20; + break; + case 0x70: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x71: + byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x72: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x73: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x74: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x75: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, 
_mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x76: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x77: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x78: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x79: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, 
_mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = 
_mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + case 0x7f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_7)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 7)), mask_7)); + byte_stream = _mm_srli_epi32(byte_stream_2, 3); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 6, 
_mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_7)); + byte_stream = _mm_srli_epi32(byte_stream, 7); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_7)); + in += 32; + to += 36; + break; + case 0x80: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x81: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x82: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = 
_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x83: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x84: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x85: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x86: 
+ tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x87: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x88: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x89: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 
0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8a: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8b: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8c: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i 
*)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8d: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8e: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, _mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + case 0x8f: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu8_epi32(tmp2)); + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu8_epi32(tmp)); + tmp2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)); + _mm_storeu_si128((__m128i *)to + 3, 
_mm_cvtepu8_epi32(tmp2)); + in += 16; + to += 16; + break; + case 0x90: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x91: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + 
case 0x92: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x93: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x94: + byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x95: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x96: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, 
mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x97: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x98: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + 
_mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x99: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9a: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9b: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9c: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = 
_mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9d: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9e: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i 
*)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + case 0x9f: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_9)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 9)), mask_9)); + byte_stream = _mm_srli_epi32(byte_stream_2, 4); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_9)); + byte_stream = _mm_srli_epi32(byte_stream, 9); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_9)); + in += 32; + to += 28; + break; + case 0xa0: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); 
+ in += 16; + to += 12; + case 0xa1: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa2: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa3: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa4: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa5: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa6: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa7: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa8: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xa9: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xaa: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + 
_mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xab: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xac: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xad: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xae: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + case 0xaf: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_10)); + byte_stream = _mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_10)); + byte_stream = 
_mm_srli_epi64(byte_stream, 10); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(byte_stream, mask_10)); + in += 16; + to += 12; + break; + case 0xb0: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb1: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb2: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb3: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb4: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb5: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = 
_mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb6: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb7: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, 
_mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb8: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xb9: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xba: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = 
_mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbb: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbc: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbd: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbe: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + case 0xbf: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(byte_stream, mask_12)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 12)), mask_12)); + byte_stream = _mm_srli_epi32(byte_stream_2, 8); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_12)); + byte_stream = _mm_srli_epi32(byte_stream, 12); + _mm_storeu_si128((__m128i *)to + 4, _mm_and_si128(byte_stream, mask_12)); + in += 32; + to += 20; + break; + case 0xc0: + tmp = _mm_loadu_si128((__m128i *)in); + 
_mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc1: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc2: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc3: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc4: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc5: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc6: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc7: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + 
_mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc8: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xc9: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xca: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcb: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcc: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcd: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xce: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + case 0xcf: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); + in += 16; + to += 8; + break; + case 0xd0: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd1: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd2: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd3: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd4: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd5: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd6: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd7: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd8: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xd9: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xda: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdb: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdc: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, 
_mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdd: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xde: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + case 0xdf: + byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, _mm_and_si128(byte_stream, mask_21)); + byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); + in += 32; + to += 12; + break; + case 0xe0: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe1: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe2: + tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe3: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe4: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe5: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe6: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe7: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe8: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xe9: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xea: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xeb: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xec: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xed: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xee: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + case 0xef: + tmp = _mm_loadu_si128((__m128i *)in); + _mm_storeu_si128((__m128i *)to, tmp); + in += 16; + to += 4; + break; + case 0xf0: + *to = *(uint8_t *)in; + in += 1; + to += 1; + case 0xf1: + *to = *(uint8_t *)in; + in += 1; + to += 1; + case 0xf2: + *to = *(uint8_t *)in; + in += 1; + to += 1; + case 0xf3: + *to = *(uint8_t *)in; + in += 1; + to += 1; + break; + case 0xf4: + *to = *(uint16_t *)in; + in += 2; + to += 1; + case 0xf5: + *to = *(uint16_t *)in; + in += 2; + to += 1; + case 0xf6: + *to = 
*(uint16_t *)in; + in += 2; + to += 1; + case 0xf7: + *to = *(uint16_t *)in; + in += 2; + to += 1; + break; + case 0xf8: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + case 0xf9: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + case 0xfa: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + case 0xfb: + *to = (*(uint8_t *)in << 16) | (*(uint8_t *)(in + 1) << 8) | (*(uint8_t *)(in + 2)); + in += 3; + to += 1; + break; + case 0xfc: + *to = *(uint32_t *)in; + in += 4; + to += 1; + case 0xfd: + *to = *(uint32_t *)in; + in += 4; + to += 1; + case 0xfe: + *to = *(uint32_t *)in; + in += 4; + to += 1; + case 0xff: + *to = *(uint32_t *)in; + in += 4; + to += 1; + break; + break; + } + } +} diff --git a/ext/bench_/bench/compress_qmx_v3.cpp b/ext/bench_/bench/compress_qmx_v3.cpp new file mode 100644 index 0000000..bb88945 --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v3.cpp @@ -0,0 +1,1510 @@ +/* + ANT_COMPRESS_QMX_V3.CPP + ------------------- + Copyright (c) 2014 by Andrew Trotman + Licensed BSD + + A version of BinPacking where we pack into a 128-bit SSE register the following: + 256 0-bit words + 128 1-bit words + 64 2-bit words + 40 3-bit words + 32 4-bit words + 24 5-bit words + 20 6-bit words + 16 8-bit words + 12 10-bit words + 8 16-bit words + 4 32-bit words + or pack into two 128-bit words (i.e. 256 bits) the following: + 36 7-bit words + 28 9-bit words + 20 12-bit words + 12 21-bit words + + This gives us 15 possible combinations. The combination is stored in the top 4 bits of a selector byte. The + bottom 4-bits of the selector store a run-length (the number of such sequences seen in a row). + + The 128-bit (or 256-bit) packed binary values are stored first. 
Then we store the selectors. Finally, + stored variable byte encoded, is a pointer to the start of the selector (from the end of the sequence). + + This way, all reads and writes are 128-bit word aligned, except addressing the selector (and the pointer + to the selector). These reads are byte aligned. + + Note: There is currently 1 unused encoding (i.e. 16 unused selector values). These might in the future be + used for encoding exceptions, much as PForDelta does. +*/ +#include +#include +#include +#include +#include +#include "compress_qmx_v3.h" + +//#define MAKE_DECOMPRESS 1 /* uncomment this and it will create a program that writes the decompressor */ +//#define TEST_ONE_STRING 1 /* Uncomment this and it will create a program that can be used to test the compressor and decompressor */ +#define NO_ZEROS 1 /* stores runs of 256 1s in a row (not 1-bit number, but actual 1 values). */ +#define SHORT_END_BLOCKS 1 + +#ifdef _MSC_VER + #define ALIGN_16 __declspec(align(16)) +#else + #define ALIGN_16 __attribute__ ((aligned (16))) +#endif + +//#define STATS /* uncomment this and it will count the selector usage */ +#ifdef STATS + static uint32_t stats[65] = {0}; +#endif + +/* + ANT_ANT_COMPRESS_QMX_V3::ANT_ANT_COMPRESS_QMX_V3() + ------------------------------------ +*/ +ANT_compress_qmx_v3::ANT_compress_qmx_v3() +{ +length_buffer = NULL; +length_buffer_length = 0; +} + +/* + ANT_ANT_COMPRESS_QMX_V3::~ANT_ANT_COMPRESS_QMX_V3() + ------------------------------------- +*/ +ANT_compress_qmx_v3::~ANT_compress_qmx_v3() +{ +delete [] length_buffer; +#ifdef STATS + uint32_t which; + for (which = 0; which <= 32; which++) + if (stats[which] != 0) + printf("%d\t%d\ttimes\n", which, stats[which]); +#endif +} + +/* + BYTES_NEEDED_FOR() + ------------------ +*/ +static uint8_t bytes_needed_for(uint32_t value) +{ +if (value <= 0xFF) + return 1; +else if (value <= 0xFFFF) + return 2; +else if (value <= 0xFFFFFF) + return 3; +else + return 4; +} + +/* + BITS_NEEDED_FOR() + 
----------------- +*/ +static uint8_t bits_needed_for(uint32_t value) +{ +if (value == 0x01) + return 0; +else if (value <= 0x01) + return 1; +else if (value <= 0x03) + return 2; +else if (value <= 0x07) + return 3; +else if (value <= 0x0F) + return 4; +else if (value <= 0x1F) + return 5; +else if (value <= 0x3F) + return 6; +else if (value <= 0x7F) + return 7; +else if (value <= 0xFF) + return 8; +else if (value <= 0x1FF) + return 9; +else if (value <= 0x3FF) + return 10; +else if (value <= 0xFFF) + return 12; +else if (value <= 0xFFFF) + return 16; +else if (value <= 0x1FFFFF) + return 21; +else + return 32; +} + + +/* + WRITE_OUT() + ----------- +*/ +static void write_out(uint8_t **buffer, uint32_t *source, uint32_t raw_count, uint32_t size_in_bits, uint8_t **length_buffer) +{ +uint32_t current, batch; +uint8_t *destination = *buffer; +uint32_t *end = source + raw_count; +uint8_t *key_store = *length_buffer; +uint32_t ALIGN_16 sequence_buffer[4]; +uint32_t instance, value; +uint8_t type; +uint32_t count; + +uint32_t max_bytes = 1; // this is the byte-width for type128 encoded non-SSE integers + +#ifdef STATS + stats[size_in_bits] += raw_count; +#endif + +if (size_in_bits == 0) + { + type = 0; + count = (raw_count + 255) / 256; + } +else if (size_in_bits == 1) + { + type = 1; // 1 bit per integer + count = (raw_count + 127) / 128; + } +else if (size_in_bits == 2) + { + type = 2; // 2 bits per integer + count = (raw_count + 63) / 64; + } +else if (size_in_bits == 3) + { + type = 3; // 3 bits per integer + count = (raw_count + 39) / 40; + } +else if (size_in_bits == 4) + { + type = 4; // 4 bits per integer + count = (raw_count + 31) / 32; + } +else if (size_in_bits == 5) + { + type = 5; // 5 bits per integer + count = (raw_count + 23) / 24; + } +else if (size_in_bits == 6) + { + type = 6; // 6 bits per integer + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 7) + { + type = 7; // 7 bits per integer, 18 integers per read (but requires 2 reads) + count 
= (raw_count + 35) / 36; + } +else if (size_in_bits == 8) + { + type = 8; // 8 bits per integer + count = (raw_count + 15) / 16; + } +else if (size_in_bits == 9) + { + type = 9; // 9 bits per integer, 14 integers per read (but requires 2 reads) + count = (raw_count + 27) / 28; + } +else if (size_in_bits == 10) + { + type = 10; // 10 bits per integer + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 12) + { + type = 11; // 12 bits per integer, 10 integers per read (but requires 2 reads) + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 16) + { + type = 12; // 16 bits per integer + count = (raw_count + 7) / 8; + } +else if (size_in_bits == 21) + { + type = 13; // 21 bits per integer, 6 integers per read (but requires 2 reads) + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 32) + { + type = 14; // 32 bits per integer + count = (raw_count + 3) / 4; + } +else if (size_in_bits == 128) + { + type = 15; + count = raw_count; + /* + As the count for type 128 can only be 1, 2, or 3, we can re-appropriate it and store the bit-length in there too. + */ + max_bytes = 1; + for (uint32_t integer = 0; integer < count; integer++) + { + if (bytes_needed_for(source[integer]) > max_bytes) + max_bytes = bytes_needed_for(source[integer]); + } + } +else + exit(printf("Can't compress into integers of size %dbits\n", size_in_bits)); + +while (count > 0) + { + batch = count > 16 ? 16 : count; + *key_store++ = (type << 4) | (~(batch - 1) & 0x0F); + + count -= batch; + + for (current = 0; current < batch; current++) + { + switch (size_in_bits) + { + case 0: // 0 bits per integer (i.e. 
a long sequence of zeros) + /* + In this case we don't need to store a 4 byte integer because its implicit + */ + source += 256; + break; + case 1: // 1 bit per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 128; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 1); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 128; + break; + case 2: // 2 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 64; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 2); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 64; + break; + case 3: // 3 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 40; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 3); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 40; + break; + case 4: // 4 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 32; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 4); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 32; + break; + case 5: // 5 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 24; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 5); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 24; + break; + case 6: // 6 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 6); + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 20; + break; + case 7: // 7 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + 
sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 7); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 16; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] >> 4; + for (value = 20; value < 36; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 20) / 4) * 7 + 3); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 36; // 36 in a double 128-bit word + break; + case 8: // 8 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 16 && source < end; instance++) +#else + for (instance = 0; instance < 16; instance++) +#endif + *destination++ = (uint8_t)*source++; + break; + case 9: // 9 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 9); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 12; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] >> 5; + for (value = 16; value < 28; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 16) / 4) * 9 + 4); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 28; // 28 in a double 128-bit word + break; + case 10: // 10 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 10); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 12; + break; + case 12: // 12 bit integers + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 12); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + 
memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] >> 8; + for (value = 12; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 12) / 4) * 12 + 8); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 20; // 20 in a double 128-bit word + break; + case 16: // 16 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 8 && source < end; instance++) +#else + for (instance = 0; instance < 8; instance++) +#endif + { + *(uint16_t *)destination = (uint16_t)*source++; + destination += 2; + } + break; + case 21: // 21 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 21); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 4; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] >> 11; + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 8) / 4) * 21 + 11); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 12; // 12 in a double 128-bit word + break; + case 32: // 32 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 4 && source < end; instance++) +#else + for (instance = 0; instance < 4; instance++) +#endif + { + *(uint32_t *)destination = (uint32_t)*source++; + destination += 4; + } + break; + case 128: + if (max_bytes == 1) + { + *(uint8_t *)destination = (uint8_t)*source; + source++; + destination += 1; + *(key_store - 1) = (type << 4) | (~(batch - 1) & 0x03); + } + else if (max_bytes == 2) + { + *(uint16_t *)destination = (uint16_t)*source; + source++; + destination += 2; + *(key_store - 1) = (type << 4) | 4 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 3) + { + *destination++ = 
(uint8_t)((*source >> 16) & 0xFF); + *destination++ = (uint8_t)((*source >> 8) & 0xFF); + *destination++ = (uint8_t)((*source >> 0) & 0xFF); + source++; + + *(key_store - 1) = (type << 4) | 8 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 4) + { + *(uint32_t *)destination = (uint32_t)*source; + source++; + destination += 4; + *(key_store - 1) = (type << 4) | 0x0C | (~(batch - 1) & 0x03); + } + else + printf("max_bytes must be 1, 2, 3, or 4, but is:%d", (int)max_bytes); + break; + } + } + } +*buffer = destination; +*length_buffer = key_store; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b) +{ +return a > b ? a : b; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b, T c, T d) +{ +return max(max(a, b), max(c, d)); +} + +/* + ANT_ANT_COMPRESS_QMX_V3::ENCODEARRAY() + ------------------------------- +*/ +void ANT_compress_qmx_v3::encodeArray(const uint32_t *source, uint64_t source_integers, uint32_t *into, uint64_t *nvalue) +{ +const uint32_t WASTAGE = 512; +uint8_t *current_length, *destination = (uint8_t *)into, *keys; +uint32_t *current, run_length, bits, new_needed, wastage; +uint32_t block, largest; + +/* + make sure we have enough room to store the lengths +*/ +if (length_buffer_length < source_integers) + { + delete [] length_buffer; + length_buffer = new uint8_t [(size_t)(length_buffer_length = source_integers) + WASTAGE]; + } + +/* + Get the lengths of the integers +*/ +current_length = length_buffer; +for (current = (uint32_t *)source; current < source + source_integers; current++) + *current_length++ = bits_needed_for(*current); + +/* + Shove a bunch of 0 length integers on the end to allow for overflow +*/ +for (wastage = 0; wastage < WASTAGE; wastage++) + *current_length++ = 0; + +/* + Process the lengths. 
To maximise SSE throughput we need each write to be 128-bit (4*32-bit) alignned + and therefore we need each compress "block" to be the same size where a compress "block" is a set of + four encoded integers starting on a 4-integer boundary. +*/ +for (current_length = length_buffer; current_length < length_buffer + source_integers + 4; current_length += 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = max(*current_length, *(current_length + 1), *(current_length + 2), *(current_length + 3)); + +/* + This code makes sure we can do aligned reads, promoting to larger integers if necessary +*/ +current_length = length_buffer; +while (current_length < length_buffer + source_integers) + { +#ifdef SHORT_END_BLOCKS + /* + If there are fewer than 16 values remaining and they all fit into 8-bits then its smaller than storing stripes + If there are fewer than 8 values remaining and they all fit into 16-bits then its smaller than storing stripes + If there are fewer than 4 values remaining and they all fit into 32-bits then its smaller than storing stripes + */ + if (source_integers - (current_length - length_buffer) < 4) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 16) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + else if (largest <= 32) + for (block = 0; block < 8; block++) + *(current_length + block) = 32; + } + else if (source_integers - (current_length - length_buffer) < 8) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + } + else if (source_integers - 
(current_length - length_buffer) < 16) + { + largest = 0; + for (block = 0; block < 16; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 16; block++) + *(current_length + block) = 8; + } + /* + Otherwise we have the standard rules for a block + */ +#endif + /* + Two things need to happen to be able to use a particular selector. The first is that all the + values that would end up in that block need to use at most the bit value of that block. + The second is that there need to be at least as many numbers remaining as the block encodes. + + For example, if the current block only needs 0-bits per int, then check that the 256 values + that would be encoded only take 0-bits. If any value needs more, or there aren't 256 numbers remaining, + then promote the current block to try encode 128 1-bit values. + */ + switch (*current_length) + { + case 0: + if (source_integers - (current_length - length_buffer) < 256) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + break; + } + for (block = 0; block < 256; block += 4) + if (*(current_length + block) > 0) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + if (*current_length == 0) + { + for (block = 0; block < 256; block++) + current_length[block] = 0; + current_length += 256; + } + break; + case 1: + if (source_integers - (current_length - length_buffer) < 128) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + break; + } + for (block = 0; block < 128; block += 4) + if (*(current_length + block) > 1) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + if (*current_length == 1) + { + for (block = 0; block < 128; block++) + current_length[block] = 1; + current_length += 128; + } + break; + case 2: + if (source_integers - 
(current_length - length_buffer) < 64) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + break; + } + for (block = 0; block < 64; block += 4) + if (*(current_length + block) > 2) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + if (*current_length == 2) + { + for (block = 0; block < 64; block++) + current_length[block] = 2; + current_length += 64; + } + break; + case 3: + if (source_integers - (current_length - length_buffer) < 40) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + break; + } + for (block = 0; block < 40; block += 4) + if (*(current_length + block) > 3) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + if (*current_length == 3) + { + for (block = 0; block < 40; block++) + current_length[block] = 3; + current_length += 40; + } + break; + case 4: + if (source_integers - (current_length - length_buffer) < 32) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + break; + } + for (block = 0; block < 32; block += 4) + if (*(current_length + block) > 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + if (*current_length == 4) + { + for (block = 0; block < 32; block++) + current_length[block] = 4; + current_length += 32; + } + break; + case 5: + if (source_integers - (current_length - length_buffer) < 24) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + break; + } + for (block = 0; block < 24; block += 4) + if (*(current_length + block) > 5) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + if (*current_length == 5) + { + for (block = 0; block < 24; block++) + current_length[block] = 
5; + current_length += 24; + } + break; + case 6: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + break; + } + for (block = 0; block < 20; block += 4) + if (*(current_length + block) > 6) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + if (*current_length == 6) + { + for (block = 0; block < 20; block++) + current_length[block] = 6; + current_length += 20; + } + break; + case 7: + if (source_integers - (current_length - length_buffer) < 36) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + break; + } + for (block = 0; block < 36; block += 4) // 36 in a double 128-bit word + if (*(current_length + block) > 7) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + if (*current_length == 7) + { + for (block = 0; block < 36; block++) + current_length[block] = 7; + current_length += 36; + } + break; + case 8: + if (source_integers - (current_length - length_buffer) < 16) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + break; + } + for (block = 0; block < 16; block += 4) + if (*(current_length + block) > 8) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + if (*current_length == 8) + { + for (block = 0; block < 16; block++) + current_length[block] = 8; + current_length += 16; + } + break; + case 9: + if (source_integers - (current_length - length_buffer) < 28) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote + break; + } + for (block = 0; block < 28; block += 4) // 28 in a double 128-bit word + if (*(current_length + block) > 9) + *current_length = *(current_length + 1) = *(current_length + 2) = 
*(current_length + 3) = 10; // promote + if (*current_length == 9) + { + for (block = 0; block < 28; block++) + current_length[block] = 9; + current_length += 28; + } + break; + case 10: + if (source_integers - (current_length - length_buffer) < 12) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + break; + } + for (block = 0; block < 12; block += 4) + if (*(current_length + block) > 10) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + if (*current_length == 10) + { + for (block = 0; block < 12; block++) + current_length[block] = 10; + current_length += 12; + } + break; + case 12: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + break; + } + for (block = 0; block < 20; block += 4) // 20 in a double 128-bit word + if (*(current_length + block) > 12) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + if (*current_length == 12) + { + for (block = 0; block < 20; block++) + current_length[block] = 12; + current_length += 20; + } + break; + case 16: + if (source_integers - (current_length - length_buffer) < 8) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + break; + } + for (block = 0; block < 8; block += 4) + if (*(current_length + block) > 16) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + if (*current_length == 16) + { + for (block = 0; block < 8; block++) + current_length[block] = 16; + current_length += 8; + } + break; + case 21: + if (source_integers - (current_length - length_buffer) < 12) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + break; + } + for (block = 0; block < 
12; block += 4) // 12 in a double 128-bit word + if (*(current_length + block) > 21) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + if (*current_length == 21) + { + for (block = 0; block < 12; block++) + current_length[block] = 21; + current_length += 12; + } + break; + case 32: + if (source_integers - (current_length - length_buffer) < 4) + { + for (block = 0; block < (source_integers - (current_length - length_buffer)); block++) + *(current_length + block) = 128; // promote + break; + } + for (block = 0; block < 4; block += 4) + if (*(current_length + block) > 32) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 64; // promote + if (*current_length == 32) + { + for (block = 0; block < 4; block++) + current_length[block] = 32; + current_length += 4; + } + break; + case 128: + /* + The 128-bit selector is used as a last resort when there are not enough numbers to use an + earlier selector. So don't worry about checking the rest. 
+ */ + current_length += source_integers - (current_length - length_buffer); + break; + default: + exit(printf("Selecting on a non whole power of 2, must exit\n")); + break; + } + } + +/* + We can now compress based on the lengths in length_buffer +*/ +run_length = 1; +bits = length_buffer[0]; +keys = length_buffer; // we're going to re-use the length_buffer because it can't overlap and this saves a double malloc +for (current = (uint32_t *)source + 1; current < source + source_integers; current++) + { + new_needed = length_buffer[current - source]; + if (new_needed == bits) + run_length++; + else + { + write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + bits = new_needed; + run_length = 1; + } + } +write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + +/* + Copy the lengths to the end, backwards +*/ +uint8_t *from = length_buffer + (keys - length_buffer) - 1; +uint8_t *to = destination; +for (uint32_t pos = 0; pos < keys - length_buffer; pos++) + *to++ = *from--; +destination += keys - length_buffer; + +/* + Compute the length (in bytes) +*/ +*nvalue = destination - (uint8_t *)into; // return length in bytes +} + +#ifdef MAKE_DECOMPRESS + /* + The following program generates the source code for ANT_compress_qmx_v3::decodeArray() + */ + /* + MAIN() + ------ + This version assumes SSE4.1 and so it is *not* portable to non X86 architectures + */ + int main(void) + { + uint32_t instance; + + printf("static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};\n"); + printf("static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};\n"); + printf("static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};\n"); + printf("static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 
0x3f};\n"); + printf("static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};\n"); + printf("static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};\n"); + printf("static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};\n"); + printf("static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};\n"); + printf("static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01};\n"); + printf("void ANT_compress_qmx_v3::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)\n"); + printf("{\n"); + printf("__m128i mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;\n"); + printf("uint8_t *in = (uint8_t *)source;\n"); + printf("uint8_t *keys = ((uint8_t *)source) + len - 1;\n"); + + printf("\n"); + printf("mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);\n"); + printf("mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);\n"); + printf("mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);\n"); + printf("mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);\n"); + printf("mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);\n"); + printf("mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);\n"); + printf("mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);\n"); + printf("mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);\n"); + printf("mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);\n"); + printf("mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);\n"); + printf("mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("\n"); + + printf("while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers\n"); + printf("\t{\n"); + printf("\tswitch (*keys--)\n"); + printf("\t\t{\n"); + + for (instance = 0; instance <= 0xFF; instance++) + { + printf("\t\tcase 0x%02x:\n", instance); + printf("\t\t\t{\n"); + if ((instance >> 4) == 0) + { + /* + 256 0-bit integers + */ + printf("#ifdef NO_ZEROS\n"); + 
printf("\t\t\tconst __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("#else\n"); + printf("\t\t\tconst __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); + printf("#endif\n"); + + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 10, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 11, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 12, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 13, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 14, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 15, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 16, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 17, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 18, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 19, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 20, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i 
*)to + %d + 21, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 22, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 23, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 24, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 25, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 26, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 27, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 28, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 29, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 30, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 31, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 32, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 33, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 34, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 35, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 36, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 37, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 38, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 39, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 40, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 41, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 42, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 43, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 44, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 45, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 46, 
tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 47, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 48, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 49, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 50, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 51, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 52, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 53, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 54, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 55, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 56, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 57, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 58, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 59, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 60, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 61, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 62, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 63, tmp);\n", run * 64); + printf("\n"); + } + printf("\t\t\tto += %d;\n", 256 * (0x10 - (instance & 0x0F))); // becomes 256 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 1) + { + /* + 128 * 1-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));\n", run * 32); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 17, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1));\n", run * 32); + printf("}\n"); + printf("\n"); + } + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 
bytes + printf("\t\t\tto += %d;\n", 128 * (0x10 - (instance & 0x0F))); // becomes 128 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 2) + { + /* + 64 * 2-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 
24), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));\n", run * 16); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 64 * (0x10 - (instance & 0x0F))); // becomes 64 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 3) + { + /* + 40 * 3-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));\n", run * 10); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 40 * (0x10 - (instance & 0x0F))); // becomes 40 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 4) + { + /* + 32 * 4-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));\n", run * 8); + printf("}\n"); + + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 32 * (0x10 - (instance & 0x0F))); // becomes 32 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 5) + { + /* + 24 * 5-bit integers + */ + for (uint32_t 
run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));\n", run * 6); + printf("}\n"); + printf("\n"); + } + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 24 * (0x10 - (instance & 0x0F))); // becomes 24 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 6) + { + /* + 20 * 6-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));\n", run * 5); + 
printf("}\n"); + printf("\n"); + } + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 20 * (0x10 - (instance & 0x0F))); // becomes 20 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 7) + { + /* + 36 * 7 bit integers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_7));\n", run * 9); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));\n", run * 9); + + printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2 + 1); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));\n", run * 9); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 32 * (0x10 - (instance & 0x0F))); // 32 bytes + printf("\t\t\tto += %d;\n", 36 * (0x10 - (instance & 0x0F))); // becomes 36 integers + 
printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 8) + { + /* + 16 * 8-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\t const __m128i tmp = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d, _mm_cvtepu8_epi32(tmp));\n", run * 4); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));\n", run * 4); + printf("\t\t\t const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 2, _mm_cvtepu8_epi32(tmp3));\n", run * 4); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));\n", run * 4); + printf("}\n"); + printf("\n"); + } + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 16 * (0x10 - (instance & 0x0F))); // becomes 16 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 9) + { + /* + 28 * 9-bit ingtegers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));\n", run * 7); + + printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2 + 1); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));\n", run * 7); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 32 * (0x10 - (instance & 0x0F))); // 32 bytes + printf("\t\t\tto += %d;\n", 28 * (0x10 - (instance & 0x0F))); // becomes 28 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 10) + { + /* + 12 * 10-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\t const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_10));\n", run * 3); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));\n", run * 3); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));\n", run * 3); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 12 * (0x10 - (instance & 0x0F))); // becomes 12 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 11) + { + /* + 20 * 12-bit ingtegers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_12));\n", run * 5); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));\n", run * 5); + + + printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2 + 1); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));\n", run * 5); + + printf("}\n"); + printf("\n"); + } + printf("\t\t\tin += %d;\n", 32 * (0x10 - (instance & 0x0F))); // 32 bytes + printf("\t\t\tto += %d;\n", 20 * (0x10 - (instance & 0x0F))); // becomes 20 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 12) + { + /* + 16-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i tmp = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_cvtepu16_epi32(tmp));\n", 2 * run); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));\n", 2 * run); + + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 8 * (0x10 - (instance & 0x0F))); // becomes 8 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 13) + { + /* + 12 * 21-bit ingtegers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, 
mask_21));\n", run * 3); + + printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + %d);\n", run * 2 + 1); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));\n", run * 3); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));\n", run * 3); + + printf("}\n"); + printf("\n"); + } + printf("\t\t\tin += %d;\n", 32 * (0x10 - (instance & 0x0F))); // 32 bytes + printf("\t\t\tto += %d;\n", 12 * (0x10 - (instance & 0x0F))); // becomes 8 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 14) + { + /* + 32-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\t const __m128i tmp = _mm_loadu_si128((__m128i *)in + %d);\n", run); + + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d, tmp);\n", run); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tin += %d;\n", 16 * (0x10 - (instance & 0x0F))); // 16 bytes + printf("\t\t\tto += %d;\n", 4 * (0x10 - (instance & 0x0F))); // becomes 4 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 15) + { + /* + 128-bit integers + if there are fewer than 4 integes then we just bit-pack them in to 8, 16, 24, or 32-bit words + */ + if ((instance & 0x0C) == 0x00) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + printf("\t\t\t*(to + %d) = *(uint8_t *)(in + %d);\n", run, run); + printf("\t\t\tin += %d;\n", 0x04 - (instance & 0x03)); // 1 byte integer + printf("\t\t\tto += %d;\n", 0x04 - (instance & 0x03)); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x04) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + printf("\t\t\t*(to + %d) = *(uint16_t *)(in + 2 * %d);\n", run, run); + printf("\t\t\tin += 2 * %d;\n", 0x04 - (instance & 0x03)); // 2 byte integers + printf("\t\t\tto += %d;\n", 0x04 - (instance 
& 0x03)); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x08) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + printf("\t\t\t*(to + %d) = (*(uint8_t *)(in + 3 * %d) << 16) | (*(uint8_t *)(in + 3 * %d + 1) << 8) | (*(uint8_t *)(in + 3 * %d + 2));\n", run, run, run, run); + printf("\t\t\tin += 3 * %d;\n", 0x04 - (instance & 0x03)); // 3 byte integer + printf("\t\t\tto += %d;\n", 0x04 - (instance & 0x03)); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x0C) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + printf("\t\t\t*(to + %d) = *(uint32_t *)(in + 4 * %d);\n", run, run); + printf("\t\t\tin += 4 * %d;\n", 0x04 - (instance & 0x03)); // 4 byte integer + printf("\t\t\tto += %d;\n", 0x04 - (instance & 0x03)); // becomes 1 integer + } + printf("\t\t\tbreak;\n"); + } + else + { + printf("\t\t\tin++;\n"); // dummy, can't occur + } + printf("\t\t\t}\n"); + } + printf("\t\t}\n"); + printf("\t}\n"); + printf("}\n"); + } + +#endif + +#ifdef TEST_ONE_STRING + static uint32_t sequence[]={13,1,1,26,18,3,1,9,4,8,5,19,7,26,1,5,7,3,12,5,39,16,3,5,19,8,18,1,1,1,2,5,9,3,21,2,6,37,3,5,5,18,3,31,3,22,5,17,6,12,6,2,5,10,3,12,51,14,7,8,1,2,3,27,19,1,10,8,2,7,2,9,16,6,6,5,6,4,18,21,13,2,1,11,3,22,2,16,13,61,21,12,51,10,6,31,14,65,15,82,5,4,18,3,1,1,4,34,5,9,4,7,1,25,17,52,60,8,8,4,22,7,49,26,2,72,29,33,6,11,3,8,1,23,37,1,3,1,1,1,3,20,6,1,2,1,1,1,14,2,4,1,6,4,4,3,1,1,2,2,1,9,29,1,10,11,4,10,31}; + + static uint32_t second_compress_buffer[100000]; + static uint32_t second_decompress_buffer[100000]; + + uint32_t second_compress_buffer_size = sizeof(second_compress_buffer) / sizeof(*second_compress_buffer); + uint32_t second_decompress_buffer_size = sizeof(second_decompress_buffer) / sizeof(*second_decompress_buffer); + + /* + CHECK() + ------- + */ + void check(uint32_t *sequence, uint32_t sequence_length) + { + ANT_compress_qmx_v3 compressor; + uint64_t buffer_size; + uint32_t pos; + uint32_t fail; + + 
memset(second_compress_buffer, 0, second_compress_buffer_size); + memset(second_decompress_buffer, 0, second_decompress_buffer_size); + + compressor.encodeArray(sequence, sequence_length, (uint32_t *)second_compress_buffer, &buffer_size); + second_compress_buffer[buffer_size] = 0; + second_compress_buffer[buffer_size + 1] = 0; + second_compress_buffer[buffer_size + 2] = 0; + second_compress_buffer[buffer_size + 3] = 0; + + for (pos = 0; pos < buffer_size; pos++) + printf("%02X ", ((uint8_t *)second_compress_buffer)[pos]); + puts(""); + + compressor.decodeArray((uint32_t *)second_compress_buffer, buffer_size, (uint32_t *)second_decompress_buffer, sequence_length); + + fail = false; + for (pos = 0; pos < sequence_length; pos++) + if (sequence[pos] != second_decompress_buffer[pos]) + { + printf("p[%d]:%X != %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + fail = true; + } + else + printf("p[%d]:%X == %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + + if (fail) + puts("Test failed"); + else + puts("Test succeeded"); + } + + /* + MAIN() + ------ + */ + int main(void) + { + check(sequence, sizeof(sequence) / sizeof(*sequence)); + } +#endif +/* + ANT_ANT_COMPRESS_QMX_V3::DECODEARRAY() + -------------------------------- + this code was generated by the method above. 
+*/ +#include "compress_qmx_v3_decompress.cpp" diff --git a/ext/bench_/bench/compress_qmx_v3.h b/ext/bench_/bench/compress_qmx_v3.h new file mode 100644 index 0000000..20cab0f --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v3.h @@ -0,0 +1,47 @@ +/* + COMPRESS_QMX_V3.H + ------------------ + QMX with: + no overflow (Matt's changes) + no VB lengths (backwards scanning selectors) + loop unwinding +*/ +#ifndef COMPRESS_QMX3_H_ +#define COMPRESS_QMX3_H_ + +#include +#include "compress.h" + +/* + class ANT_COMPRESS_QMX_V3 + ------------------------- +*/ +class ANT_compress_qmx_v3 : public ANT_compress + { + private: + uint8_t *length_buffer; + uint64_t length_buffer_length; + + public: + ANT_compress_qmx_v3(); + virtual ~ANT_compress_qmx_v3(); + + void encodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t *nvalue); + static void decodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t nvalue); + + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) + { + uint64_t answer; + encodeArray(source, source_integers, (uint32_t *)destination, &answer); + return answer; + } + + virtual void decompress(uint32_t *destination, uint64_t destinaton_integers, uint8_t *source, uint64_t source_length) + { + decodeArray((uint32_t *)source, source_length, destination, destinaton_integers); + } + } ; + +#endif + + diff --git a/ext/bench_/bench/compress_qmx_v3_decompress.cpp b/ext/bench_/bench/compress_qmx_v3_decompress.cpp new file mode 100644 index 0000000..a54e12d --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v3_decompress.cpp @@ -0,0 +1,33908 @@ +static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; +static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; +static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; +static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; +static uint32_t ALIGN_16 
static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f}; +static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; +static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; +static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; +static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; +static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; +static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; +void ANT_compress_qmx_v3::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) +{ +__m128i mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; +uint8_t *in = (uint8_t *)source; +uint8_t *keys = ((uint8_t *)source) + len - 1; + +mask_21 = _mm_loadu_si128((__m128i *)static_mask_21); +mask_12 = _mm_loadu_si128((__m128i *)static_mask_12); +mask_10 = _mm_loadu_si128((__m128i *)static_mask_10); +mask_9 = _mm_loadu_si128((__m128i *)static_mask_9); +mask_7 = _mm_loadu_si128((__m128i *)static_mask_7); +mask_6 = _mm_loadu_si128((__m128i *)static_mask_6); +mask_5 = _mm_loadu_si128((__m128i *)static_mask_5); +mask_4 = _mm_loadu_si128((__m128i *)static_mask_4); +mask_3 = _mm_loadu_si128((__m128i *)static_mask_3); +mask_2 = _mm_loadu_si128((__m128i *)static_mask_2); +mask_1 = _mm_loadu_si128((__m128i *)static_mask_1); + +while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers + { + switch (*keys--) + { + case 0x00: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 832, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 896, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 960, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 960 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 63, tmp); + + to += 
4096; + break; + } + case 0x01: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 
12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 
52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 832, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 896, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 39, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 63, tmp); + + to += 3840; + break; + } + case 0x02: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 832, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); + + to += 3584; + break; + } + case 0x03: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + 
_mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + to += 3328; + break; + } + case 0x04: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 
+ 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 
+ 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + to += 3072; + break; + } + case 0x05: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + to += 2816; + break; + } + case 0x06: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + to += 2560; + break; + } + case 0x07: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i 
*)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 
192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + to += 2304; + break; + } + case 0x08: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 
+ 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 
0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i 
*)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); 
+ _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + 
_mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + to += 2048; + break; + } + case 0x09: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 
+ 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 
+ 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + to += 1792; + break; + } + case 0x0a: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i 
*)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + to += 1536; + break; + } + case 0x0b: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 
64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i 
*)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 
+ 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + to += 1280; + break; + } + case 0x0c: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + to += 
1024; + break; + } + case 0x0d: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 
12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 
52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + to += 768; + break; + } + case 0x0e: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = 
_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 
+ 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 
+ 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + to += 512; + break; + } + case 0x0f: + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + to += 256; + break; + } + case 0x10: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 
+ 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); 
+ _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 
29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 
30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 480, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 480 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 256; + to += 2048; + break; + } + case 0x11: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); 
+ _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); 
+} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 
6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 
2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 240; + to += 1920; + break; + } + case 0x12: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 
31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 224; + to += 1792; + break; + } + case 0x13: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 208; + to += 1664; + break; + } + case 0x14: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 
11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); 
+ _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), 
mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + 
+{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 
352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 192; + to += 1536; + break; + } + case 0x15: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, 
_mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 
7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 176; + to += 1408; + break; + } + case 0x16: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 
96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 160; + to += 1280; + break; + } + case 0x17: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 
11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 
31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 
18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, 
_mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 
20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 144; + to += 1152; + break; + } + case 0x18: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 
2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 
32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 128; + to += 1024; + break; + } + case 0x19: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 
10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 
30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 112; + to += 896; + break; + } + case 0x1a: + { +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 
21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 96; + to += 768; + break; + } + case 0x1b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 
96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 80; + to += 640; + break; + } + case 0x1c: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 
32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 64; + to += 512; + break; + } + case 0x1d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 
2); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 48; + to += 384; + break; + } + case 0x1e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 32; + to += 256; + break; + } + case 0x1f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + in += 16; + to += 128; + break; + } + case 0x20: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 
28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 
+ 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + 
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to 
+ 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 240, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 256; + to += 1024; + break; + } + case 0x21: + { +{ + 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 
64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + 
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to 
+ 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 240; + to += 960; + break; + } + case 0x22: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 
28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 
+ 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + 
_mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to 
+ 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 224; + to += 896; + break; + } + case 0x23: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + 
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 
+ 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 
8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 208; + to += 832; + break; + } + case 0x24: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 
2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 
+ 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 
30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 192; + to += 768; + break; + } + case 0x25: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 
1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 
+ 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 
28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 176; + to += 704; + break; + } + case 0x26: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 
+ 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 
26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 144, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 160; + to += 640; + break; + } + case 0x27: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 
26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 144; + to += 576; + break; + } + case 0x28: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + 
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 
+ 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 128; + to += 512; + break; + } + case 0x29: + { +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i 
*)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 
64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + 
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 112; + to += 448; + break; + } + case 0x2a: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 
48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + 
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 96; + to += 384; + break; + } + case 0x2b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 
48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 80; + to += 320; + break; + } + case 0x2c: + { +{ + const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 
+ 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 64; + to += 256; + break; + } + case 0x2d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + 
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 48; + to += 192; + break; + } + case 0x2e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 32; + to += 128; + break; + } + case 0x2f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + in += 16; + to += 64; + break; + } + case 0x30: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + 
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 
7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 
12), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 150, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 256; + to += 640; + break; + } + case 0x31: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + 
_mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i 
*)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 240; + to += 600; + break; + } + case 0x32: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 
4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + 
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i 
*)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 224; + to += 560; + break; + } + case 0x33: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + 
_mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 208; + to += 520; + break; + } + case 0x34: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 192; + to += 480; + break; + } + case 0x35: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 
1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + 
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 
6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 176; + to += 440; + break; + } + case 0x36: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), 
mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), 
mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, 
mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 160; + to += 400; + break; + } + case 0x37: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); 
+} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 144; + to += 360; + break; + } + case 0x38: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 128; + to += 320; + break; + } + case 0x39: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 112; + to += 280; + break; + } + case 0x3a: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 96; + to += 240; + break; + } + case 0x3b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 80; + to += 200; + break; + } + case 0x3c: + { +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 64; + to += 160; + break; + } + case 0x3d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 48; + to += 120; + break; + } + case 0x3e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 32; + to += 80; + break; + } + case 0x3f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + in += 16; + to += 40; + break; + } + case 0x40: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 
+ 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 256; + to += 512; + break; + } + case 0x41: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 
5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 240; + to += 480; + break; + } + case 0x42: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 
28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 224; + to += 448; + break; + } + case 0x43: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream 
= _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); 
+ _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i 
*)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 208; + to += 416; + break; + } + case 0x44: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to 
+ 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, 
mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 192; + to += 384; + break; + } + case 0x45: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, 
mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 176; + to += 352; + break; + } + case 0x46: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, 
_mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, 
mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 160; + to += 320; + break; + } + case 0x47: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, 
mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 144; + to += 288; + break; + } + case 0x48: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + 
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 128; + to += 256; + break; + } + case 0x49: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + 
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 112; + to += 224; + break; + } + case 0x4a: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + 
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 96; + to += 192; + break; + } + case 0x4b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + 
_mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 80; + to += 160; + break; + } + case 0x4c: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 64; + to += 128; + break; + } + case 0x4d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + 
_mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 48; + to += 96; + break; + } + case 0x4e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 32; + to += 64; + break; + } + case 0x4f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + in += 16; + to += 32; + break; + } + case 0x50: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); 
+ _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 256; + to += 384; + break; + } + case 0x51: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 
+ 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 
5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 240; + to += 360; + break; + } + case 0x52: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 
5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), 
mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + 
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 78, 
_mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 224; + to += 336; + break; + } + case 0x53: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, 
_mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + 
_mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 208; + to += 312; + break; + } + case 0x54: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 192; + to += 288; + break; + } + case 0x55: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 176; + to += 264; + break; + } + case 0x56: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), 
mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 160; + to += 240; + break; + } + case 0x57: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), 
mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + 
_mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 144; + to += 216; + break; + } + case 0x58: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + 
_mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, 
_mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 128; + to += 192; + break; + } + case 0x59: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 
0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + 
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 112; + to += 168; + break; + } + case 0x5a: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 96; + to += 144; + break; + } + case 0x5b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); 
+ _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 80; + to += 120; + break; + } + case 0x5c: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); 
+ _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 64; + to += 96; + break; + } + case 0x5d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 48; + to += 72; + break; + } + case 0x5e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 32; + to += 48; + break; + } + case 0x5f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + in += 16; + to += 24; + break; + } + case 0x60: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 
+ 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 256; + to += 320; + break; + } + case 0x61: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 240; + to += 300; + break; + } + case 0x62: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 
12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, 
mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 224; + to += 280; + break; + } + case 0x63: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 208; + to += 260; + break; + } + case 0x64: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + 
_mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 192; + to += 240; + break; + } + case 0x65: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, 
mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 176; + to += 220; + break; + } + case 0x66: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 160; + to += 200; + break; + } + case 0x67: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 144; + to += 180; + break; + } + case 0x68: + { +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 128; + to += 160; + break; + } + case 0x69: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 112; + to += 140; + break; + } + case 0x6a: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 96; + to += 120; + break; + } + case 0x6b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); 
+ _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 80; + to += 100; + break; + } + case 0x6c: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 64; + to += 80; + break; + } + case 0x6d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 48; + to += 60; + break; + } + case 0x6e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 32; + to += 40; + break; + } + case 0x6f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + in += 16; + to += 20; + break; + } + case 0x70: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), 
_mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, 
mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), 
mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 
+ 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); + _mm_storeu_si128((__m128i *)to + 135, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); + _mm_storeu_si128((__m128i *)to + 135 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 512; + to += 576; + break; + } + case 0x71: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 480; + to += 540; + break; + } + case 0x72: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, 
_mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 81 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 448; + to += 504; + break; + } + case 0x73: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), 
mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); 
+ _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, 
mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 416; + to += 468; + break; + } + case 0x74: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 
8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 384; + to += 432; + break; + } + case 0x75: + { +{ + const __m128i byte_stream 
= _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i 
*)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 352; + to += 396; + break; + } + case 0x76: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, 
mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 320; + to += 360; + break; + } + case 0x77: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 288; + to += 324; + break; + } + case 0x78: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, 
mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), 
mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 256; + to += 288; + break; + } + case 0x79: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 224; + to += 252; + break; + } + case 0x7a: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 192; + to += 216; + break; + } + case 0x7b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), 
_mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 160; + to += 180; + break; + } + case 0x7c: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 27, 
_mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 128; + to += 144; + break; + } + case 0x7d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 96; + to += 108; + break; + } + case 0x7e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), 
_mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 64; + to += 72; + break; + } + case 0x7f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + in += 32; + to += 36; + break; + } + case 0x80: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), 
_mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to 
+ 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 52 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 60, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 256; + to += 256; + break; + } + case 0x81: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); + 
_mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 240; + to += 240; + break; + } + case 0x82: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + 
_mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 224; + to += 224; + break; + } + case 0x83: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + 
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 208; + to += 208; + break; + } + case 0x84: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + 
_mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 192; + to += 192; + break; + } + case 0x85: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 176; + to += 176; + break; + } + case 0x86: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 160; + to += 160; + break; + } + case 0x87: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i 
*)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const 
__m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 144; + to += 144; + break; + } + case 0x88: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, 
_mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i 
tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 128; + to += 128; + break; + } + case 0x89: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 
0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 112; + to += 112; + break; + } + case 0x8a: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 96; + to += 96; + break; + } + case 0x8b: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 80; + to += 80; + break; + } + case 0x8c: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 64; + to += 64; + break; + } + case 0x8d: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 48; + to += 48; + break; + } + case 0x8e: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 32; + to += 32; + break; + } + case 0x8f: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + in += 16; + to += 16; + break; + } + case 0x90: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + 
+{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + 
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), 
_mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); + _mm_storeu_si128((__m128i *)to + 105, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); + _mm_storeu_si128((__m128i *)to + 105 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 512; + to += 448; + break; + } + case 0x91: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to 
+ 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), 
mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), 
mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 98 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 480; + to += 420; + break; + } + case 0x92: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), 
_mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); + 
_mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 448; + to += 392; + break; + } + case 0x93: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), 
mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), 
mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 70 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 416; + to += 364; + break; + } + case 0x94: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), 
_mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + 
_mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 384; + to += 336; + break; + } + case 0x95: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), 
mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), 
mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 70 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 352; + to += 308; + break; + } + case 0x96: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), 
_mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 320; + to += 280; + break; + } + case 0x97: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to 
+ 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 288; + to += 252; + break; + } + case 0x98: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to 
+ 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 
5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 256; + to += 224; + break; + } + case 0x99: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), 
_mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 224; + to += 196; + break; + } + case 0x9a: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + 
_mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 192; + to += 168; + break; + } + case 0x9b: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i 
*)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 
27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 
6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 160; + to += 140; + break; + } + case 0x9c: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + 
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 128; + to += 112; + break; + } + case 0x9d: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const 
__m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 96; + to += 84; + break; + } + case 0x9e: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 64; + to += 56; + break; + } + case 0x9f: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + in += 32; + to += 28; + break; + } + case 0xa0: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + 
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, 
mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 256; + to += 192; + break; + } + case 0xa1: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + 
_mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 240; + to += 180; + break; + } + case 0xa2: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, 
_mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 224; + to += 168; + break; + } + case 0xa3: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + 
_mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 208; + to += 156; + break; + } + case 0xa4: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + 
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 192; + to += 144; + break; + } + case 0xa5: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, 
_mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_10)); +} + + in += 176; + to += 132; + break; + } + case 0xa6: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i 
*)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 160; + to += 120; + break; + } + case 0xa7: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ 
+ const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); 
+ _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 144; + to += 108; + break; + } + case 0xa8: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + 
_mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 128; + to += 96; + break; + } + case 0xa9: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, 
mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 112; + to += 84; + break; + } + case 0xaa: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 96; + to += 72; + break; + } + case 0xab: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 12, 
_mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 80; + to += 60; + break; + } + case 0xac: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 64; + to += 48; + break; + } + case 0xad: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 48; + to += 36; + break; + } + case 0xae: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 32; + to += 24; + break; + } + case 0xaf: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + in += 16; + to += 12; + break; + } + 
case 0xb0: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); + _mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); + _mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 512; + to += 320; + break; + } + case 0xb1: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i 
*)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + 
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + 
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 480; + to += 300; + break; + } + case 0xb2: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + 
_mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 448; + to += 280; + break; + } + case 0xb3: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 50 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 416; + to += 260; + break; + } + case 0xb4: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 
= _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), 
mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 384; + to += 240; + break; + } + case 0xb5: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 
0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + 
_mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 352; + to += 220; + break; + } + case 0xb6: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 
0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 320; + to += 200; + break; + } + case 0xb7: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + 
_mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 288; + to += 180; + break; + } + case 0xb8: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + 
_mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 256; + to += 160; + break; + } + case 0xb9: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 
0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 224; + to += 140; + break; + } + case 0xba: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 192; + to += 120; + break; + } + case 0xbb: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i 
*)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 160; + to += 100; + break; + } + case 0xbc: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 128; + to += 80; + break; + } + case 0xbd: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 96; + to += 60; + break; + } + case 0xbe: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 64; + to += 40; + break; + } + case 0xbf: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + in += 32; + to += 20; + break; + } + case 0xc0: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 30, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 256; + to += 128; + break; + } + case 0xc1: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 
8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 240; + to += 120; + break; + } + case 0xc2: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + 
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i 
*)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 224; + to += 112; + break; + } + case 0xc3: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 208; + to += 104; + break; + } + case 0xc4: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); 
+} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + 
_mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 192; + to += 96; + break; + } + case 0xc5: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 
8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 176; + to += 88; + break; + } + case 0xc6: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, 
_mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 160; + to += 80; + break; + } + case 0xc7: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 144; + to += 72; + break; + } + case 0xc8: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 128; + to += 64; + break; + } + case 0xc9: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 112; + to += 56; + break; + } + case 0xca: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 96; + to += 48; + break; + } + case 0xcb: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 80; + to += 40; + break; + } + case 0xcc: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 64; + to += 32; + break; + } + case 0xcd: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i 
*)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 48; + to += 24; + break; + } + case 0xce: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 32; + to += 16; + break; + } + case 0xcf: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + in += 16; + to += 8; + break; + } + case 0xd0: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), 
mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ 
+ const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 
27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in 
+ 27); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 30); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 31); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 512; + to += 192; + break; + } + case 0xd1: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + 
_mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} 
+ +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 28); + _mm_storeu_si128((__m128i 
*)to + 42, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 29); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 480; + to += 180; + break; + } + case 0xd2: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), 
mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 26); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 27); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 448; + to += 168; + break; + } + case 0xd3: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, 
mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 24); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 25); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 416; + to += 156; + break; + } + case 0xd4: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), 
mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 22); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 23); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 384; + to += 144; + break; + } + case 0xd5: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 20); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, 
mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 21); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 352; + to += 132; + break; + } + case 0xd6: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 18); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 19); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 320; + to += 120; + break; + } + case 0xd7: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 16); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 17); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 288; + to += 108; + break; + } + case 0xd8: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), 
mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 
15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 256; + to += 96; + break; + } + case 0xd9: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const 
__m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 224; + to += 84; + break; + } + case 0xda: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 192; + to += 72; + break; + } + case 0xdb: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 160; + 
to += 60; + break; + } + case 0xdc: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 128; + to += 48; + break; + } + case 0xdd: + { +{ + const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 96; + to += 36; + break; + } + case 0xde: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 2); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 64; + to += 24; + break; + } + case 0xdf: + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + in += 32; + to += 12; + break; + } + case 0xe0: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 13, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 14, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 15); + _mm_storeu_si128((__m128i *)to + 15, tmp); +} + + in += 256; + to += 64; + break; + } + case 0xe1: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + 
_mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 13, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 14); + _mm_storeu_si128((__m128i *)to + 14, tmp); +} + + in += 240; + to += 60; + break; + } + case 0xe2: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + +{ + const 
__m128i tmp = _mm_loadu_si128((__m128i *)in + 13); + _mm_storeu_si128((__m128i *)to + 13, tmp); +} + + in += 224; + to += 56; + break; + } + case 0xe3: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 12); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + + in += 208; + to += 52; + break; + } + case 0xe4: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp 
= _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 11); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + + in += 192; + to += 48; + break; + } + case 0xe5: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to 
+ 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 10); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + + in += 176; + to += 44; + break; + } + case 0xe6: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 9); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + + in += 160; + to += 40; + break; + } + case 0xe7: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + 
+{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 8); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + + in += 144; + to += 36; + break; + } + case 0xe8: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 7); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + + in += 128; + to += 32; + break; + } + case 0xe9: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i 
tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 6); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + + in += 112; + to += 28; + break; + } + case 0xea: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 5); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + + in += 96; + to += 24; + break; + } + case 0xeb: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 4); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + + in += 80; + to += 20; + break; + } + case 0xec: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 3); + _mm_storeu_si128((__m128i *)to + 
3, tmp); +} + + in += 64; + to += 16; + break; + } + case 0xed: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 2); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + + in += 48; + to += 12; + break; + } + case 0xee: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 1); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + + in += 32; + to += 8; + break; + } + case 0xef: + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in + 0); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + + in += 16; + to += 4; + break; + } + case 0xf0: + { + *(to + 0) = *(uint8_t *)(in + 0); + *(to + 1) = *(uint8_t *)(in + 1); + *(to + 2) = *(uint8_t *)(in + 2); + *(to + 3) = *(uint8_t *)(in + 3); + in += 4; + to += 4; + break; + } + case 0xf1: + { + *(to + 0) = *(uint8_t *)(in + 0); + *(to + 1) = *(uint8_t *)(in + 1); + *(to + 2) = *(uint8_t *)(in + 2); + in += 3; + to += 3; + break; + } + case 0xf2: + { + *(to + 0) = *(uint8_t *)(in + 0); + *(to + 1) = *(uint8_t *)(in + 1); + in += 2; + to += 2; + break; + } + case 0xf3: + { + *(to + 0) = *(uint8_t *)(in + 0); + in += 1; + to += 1; + break; + } + case 0xf4: + { + *(to + 0) = *(uint16_t *)(in + 2 * 0); + *(to + 1) = *(uint16_t *)(in + 2 * 1); + *(to + 2) = *(uint16_t *)(in + 2 * 2); + *(to + 3) = *(uint16_t *)(in + 2 * 3); + in += 2 * 4; + to += 4; + break; + } + case 0xf5: + { + *(to + 0) = *(uint16_t *)(in + 2 * 0); + *(to + 1) = *(uint16_t *)(in + 2 * 1); + *(to + 2) = *(uint16_t *)(in + 2 * 2); + in += 2 * 3; + to += 3; + break; + } + case 0xf6: + { + *(to + 0) = *(uint16_t *)(in + 2 * 0); + *(to + 1) = *(uint16_t *)(in + 2 * 1); + in += 2 * 2; + to += 2; + break; + } + case 0xf7: + { + *(to 
+ 0) = *(uint16_t *)(in + 2 * 0); + in += 2 * 1; + to += 1; + break; + } + case 0xf8: + { + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); + *(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2)); + *(to + 3) = (*(uint8_t *)(in + 3 * 3) << 16) | (*(uint8_t *)(in + 3 * 3 + 1) << 8) | (*(uint8_t *)(in + 3 * 3 + 2)); + in += 3 * 4; + to += 4; + break; + } + case 0xf9: + { + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); + *(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2)); + in += 3 * 3; + to += 3; + break; + } + case 0xfa: + { + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); + in += 3 * 2; + to += 2; + break; + } + case 0xfb: + { + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + in += 3 * 1; + to += 1; + break; + } + case 0xfc: + { + *(to + 0) = *(uint32_t *)(in + 4 * 0); + *(to + 1) = *(uint32_t *)(in + 4 * 1); + *(to + 2) = *(uint32_t *)(in + 4 * 2); + *(to + 3) = *(uint32_t *)(in + 4 * 3); + in += 4 * 4; + to += 4; + break; + } + case 0xfd: + { + *(to + 0) = *(uint32_t *)(in + 4 * 0); + *(to + 1) = *(uint32_t *)(in + 4 * 1); + *(to + 2) = *(uint32_t *)(in + 4 * 2); + in += 4 * 3; + to += 3; + break; + } + case 0xfe: + { + *(to + 0) = *(uint32_t *)(in + 4 * 0); + *(to + 1) = *(uint32_t *)(in + 4 * 1); + in += 4 * 2; 
+ to += 2; + break; + } + case 0xff: + { + *(to + 0) = *(uint32_t *)(in + 4 * 0); + in += 4 * 1; + to += 1; + break; + } + } + } +} diff --git a/ext/bench_/bench/compress_qmx_v4.cpp b/ext/bench_/bench/compress_qmx_v4.cpp new file mode 100644 index 0000000..c9ee580 --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v4.cpp @@ -0,0 +1,1527 @@ +/* + ANT_COMPRESS_QMX_V4.CPP + ------------------- + Copyright (c) 2014 by Andrew Trotman + Licensed BSD + + A version of BinPacking where we pack into a 128-bit SSE register the following: + 256 0-bit words + 128 1-bit words + 64 2-bit words + 40 3-bit words + 32 4-bit words + 24 5-bit words + 20 6-bit words + 16 8-bit words + 12 10-bit words + 8 16-bit words + 4 32-bit words + or pack into two 128-bit words (i.e. 256 bits) the following: + 36 7-bit words + 28 9-bit words + 20 12-bit words + 12 21-bit words + + This gives us 15 possible combinations. The combination is stored in the top 4 bits of a selector byte. The + bottom 4-bits of the selector store a run-length (the number of such sequences seen in a row). + + The 128-bit (or 256-bit) packed binary values are stored first. Then we store the selectors. Finally, + stored variable byte encoded, is a pointer to the start of the selector (from the end of the sequence). + + This way, all reads and writes are 128-bit word aligned, except addressing the selector (and the pointer + to the selector). These reads are byte aligned. + + Note: There is currently 1 unused encoding (i.e. 16 unused selector values). These might in the future be + used for encoding exceptions, much as PForDelta does. 
+*/ +#include +#include +#include +#include +#include +#include "compress_qmx_v4.h" + +//#define MAKE_DECOMPRESS 1 /* uncomment this and it will create a program that writes the decompressor */ +//#define TEST_ONE_STRING 1 /* Uncomment this and it will create a program that can be used to test the compressor and decompressor */ +#define NO_ZEROS 1 /* stores runs of 256 1s in a row (not 1-bit number, but actual 1 values). */ +#define SHORT_END_BLOCKS 1 + +#ifdef _MSC_VER + #define ALIGN_16 __declspec(align(16)) +#else + #define ALIGN_16 __attribute__ ((aligned (16))) +#endif + +//#define STATS /* uncomment this and it will count the selector usage */ +#ifdef STATS + static uint32_t stats[65] = {0}; +#endif + +/* + ANT_ANT_COMPRESS_QMX_V4::ANT_ANT_COMPRESS_QMX_V4() + ------------------------------------ +*/ +ANT_compress_qmx_v4::ANT_compress_qmx_v4() +{ +length_buffer = NULL; +length_buffer_length = 0; +} + +/* + ANT_ANT_COMPRESS_QMX_V4::~ANT_ANT_COMPRESS_QMX_V4() + ------------------------------------- +*/ +ANT_compress_qmx_v4::~ANT_compress_qmx_v4() +{ +delete [] length_buffer; +#ifdef STATS + uint32_t which; + for (which = 0; which <= 32; which++) + if (stats[which] != 0) + printf("%d\t%d\ttimes\n", which, stats[which]); +#endif +} + +/* + BYTES_NEEDED_FOR() + ------------------ +*/ +static uint8_t bytes_needed_for(uint32_t value) +{ +if (value <= 0xFF) + return 1; +else if (value <= 0xFFFF) + return 2; +else if (value <= 0xFFFFFF) + return 3; +else + return 4; +} + +/* + BITS_NEEDED_FOR() + ----------------- +*/ +static uint8_t bits_needed_for(uint32_t value) +{ +if (value == 0x01) + return 0; +else if (value <= 0x01) + return 1; +else if (value <= 0x03) + return 2; +else if (value <= 0x07) + return 3; +else if (value <= 0x0F) + return 4; +else if (value <= 0x1F) + return 5; +else if (value <= 0x3F) + return 6; +else if (value <= 0x7F) + return 7; +else if (value <= 0xFF) + return 8; +else if (value <= 0x1FF) + return 9; +else if (value <= 0x3FF) + return 10; 
+else if (value <= 0xFFF) + return 12; +else if (value <= 0xFFFF) + return 16; +else if (value <= 0x1FFFFF) + return 21; +else + return 32; +} + + +/* + WRITE_OUT() + ----------- +*/ +static void write_out(uint8_t **buffer, uint32_t *source, uint32_t raw_count, uint32_t size_in_bits, uint8_t **length_buffer) +{ +uint32_t current, batch; +uint8_t *destination = *buffer; +uint32_t *end = source + raw_count; +uint8_t *key_store = *length_buffer; +uint32_t ALIGN_16 sequence_buffer[4]; +uint32_t instance, value; +uint8_t type; +uint32_t count; + +uint32_t max_bytes = 1; // this is the bytw-width for type128 encoded non-SSE integers + +#ifdef STATS + stats[size_in_bits] += raw_count; +#endif + +if (size_in_bits == 0) + { + type = 0; + count = (raw_count + 255) / 256; + } +else if (size_in_bits == 1) + { + type = 1; // 1 bit per integer + count = (raw_count + 127) / 128; + } +else if (size_in_bits == 2) + { + type = 2; // 2 bits per integer + count = (raw_count + 63) / 64; + } +else if (size_in_bits == 3) + { + type = 3; // 3 bits per integer + count = (raw_count + 39) / 40; + } +else if (size_in_bits == 4) + { + type = 4; // 4 bits per integer + count = (raw_count + 31) / 32; + } +else if (size_in_bits == 5) + { + type = 5; // 5 bits per integer + count = (raw_count + 23) / 24; + } +else if (size_in_bits == 6) + { + type = 6; // 6 bits per integer + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 7) + { + type = 7; // 7 bits per integer, 18 integers per read (but requires 2 reads) + count = (raw_count + 35) / 36; + } +else if (size_in_bits == 8) + { + type = 8; // 8 bits per integer + count = (raw_count + 15) / 16; + } +else if (size_in_bits == 9) + { + type = 9; // 9 bits per integer, 14 integers per read (but requires 2 reads) + count = (raw_count + 27) / 28; + } +else if (size_in_bits == 10) + { + type = 10; // 10 bits per integer + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 12) + { + type = 11; // 12 bits per integer, 10 integers per 
read (but requires 2 reads) + count = (raw_count + 19) / 20; + } +else if (size_in_bits == 16) + { + type = 12; // 16 bits per integer + count = (raw_count + 7) / 8; + } +else if (size_in_bits == 21) + { + type = 13; // 21 bits per integer, 6 integers per read (but requires 2 reads) + count = (raw_count + 11) / 12; + } +else if (size_in_bits == 32) + { + type = 14; // 32 bits per integer + count = (raw_count + 3) / 4; + } +else if (size_in_bits == 128) + { + type = 15; + count = raw_count; + /* + As the count for type 128 can only be 1, 2, or 3, we can re-appropriate it and store the bit-length in there too. + */ + max_bytes = 1; + for (uint32_t integer = 0; integer < count; integer++) + { + if (bytes_needed_for(source[integer]) > max_bytes) + max_bytes = bytes_needed_for(source[integer]); + } + } +else + exit(printf("Can't compress into integers of size %dbits\n", size_in_bits)); + +while (count > 0) + { + batch = count > 16 ? 16 : count; + *key_store++ = (type << 4) | (~(batch - 1) & 0x0F); + + count -= batch; + + for (current = 0; current < batch; current++) + { + switch (size_in_bits) + { + case 0: // 0 bits per integer (i.e. 
a long sequence of zeros) + /* + In this case we don't need to store a 4 byte integer because its implicit + */ + source += 256; + break; + case 1: // 1 bit per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 128; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 1); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 128; + break; + case 2: // 2 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 64; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 2); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 64; + break; + case 3: // 3 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 40; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 3); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 40; + break; + case 4: // 4 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 32; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 4); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 32; + break; + case 5: // 5 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 24; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 5); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 24; + break; + case 6: // 6 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 6); + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 20; + break; + case 7: // 7 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 20; value++) + 
sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 7); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 16; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] >> 4; + for (value = 20; value < 36; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 20) / 4) * 7 + 3); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 36; // 36 in a double 128-bit word + break; + case 8: // 8 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 16 && source < end; instance++) +#else + for (instance = 0; instance < 16; instance++) +#endif + *destination++ = (uint8_t)*source++; + break; + case 9: // 9 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 9); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 12; value < 16; value++) + sequence_buffer[value & 0x03] |= source[value] >> 5; + for (value = 16; value < 28; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 16) / 4) * 9 + 4); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 28; // 28 in a double 128-bit word + break; + case 10: // 10 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 10); + + memcpy(destination, sequence_buffer, 16); + destination += 16; + source += 12; + break; + case 12: // 12 bit integers + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 12); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + 
memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] >> 8; + for (value = 12; value < 20; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 12) / 4) * 12 + 8); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 20; // 20 in a double 128-bit word + break; + case 16: // 16 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 8 && source < end; instance++) +#else + for (instance = 0; instance < 8; instance++) +#endif + { + *(uint16_t *)destination = (uint16_t)*source++; + destination += 2; + } + break; + case 21: // 21 bits per integer + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 0; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] << ((value / 4) * 21); + memcpy(destination, sequence_buffer, 16); + destination += 16; + + memset(sequence_buffer, 0, sizeof(sequence_buffer)); + for (value = 4; value < 8; value++) + sequence_buffer[value & 0x03] |= source[value] >> 11; + for (value = 8; value < 12; value++) + sequence_buffer[value & 0x03] |= source[value] << (((value - 8) / 4) * 21 + 11); + memcpy(destination, sequence_buffer, 16); + + destination += 16; + source += 12; // 12 in a double 128-bit word + break; + case 32: // 32 bits per integer +#ifdef SHORT_END_BLOCKS + for (instance = 0; instance < 4 && source < end; instance++) +#else + for (instance = 0; instance < 4; instance++) +#endif + { + *(uint32_t *)destination = (uint32_t)*source++; + destination += 4; + } + break; + case 128: + if (max_bytes == 1) + { + *(uint8_t *)destination = (uint8_t)*source; + source++; + destination += 1; + *(key_store - 1) = (type << 4) | (~(batch - 1) & 0x03); + } + else if (max_bytes == 2) + { + *(uint16_t *)destination = (uint16_t)*source; + source++; + destination += 2; + *(key_store - 1) = (type << 4) | 4 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 3) + { + *destination++ = 
(uint8_t)((*source >> 16) & 0xFF); + *destination++ = (uint8_t)((*source >> 8) & 0xFF); + *destination++ = (uint8_t)((*source >> 0) & 0xFF); + source++; + + *(key_store - 1) = (type << 4) | 8 | (~(batch - 1) & 0x03); + } + else if (max_bytes == 4) + { + *(uint32_t *)destination = (uint32_t)*source; + source++; + destination += 4; + *(key_store - 1) = (type << 4) | 0x0C | (~(batch - 1) & 0x03); + } + else + printf("max_bytes must be 1, 2, 3, or 4, but is:%d", (int)max_bytes); + break; + } + } + } +*buffer = destination; +*length_buffer = key_store; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b) +{ +return a > b ? a : b; +} + +/* + MAX() + ----- +*/ +template +T max(T a, T b, T c, T d) +{ +return max(max(a, b), max(c, d)); +} + +/* + ANT_ANT_COMPRESS_QMX_V4::ENCODEARRAY() + ------------------------------- +*/ +void ANT_compress_qmx_v4::encodeArray(const uint32_t *source, uint64_t source_integers, uint32_t *into, uint64_t *nvalue) +{ +const uint32_t WASTAGE = 512; +uint8_t *current_length, *destination = (uint8_t *)into, *keys; +uint32_t *current, run_length, bits, new_needed, wastage; +uint32_t block, largest; + +/* + make sure we have enough room to store the lengths +*/ +if (length_buffer_length < source_integers) + { + delete [] length_buffer; + length_buffer = new uint8_t [(size_t)(length_buffer_length = source_integers) + WASTAGE]; + } + +/* + Get the lengths of the integers +*/ +current_length = length_buffer; +for (current = (uint32_t *)source; current < source + source_integers; current++) + *current_length++ = bits_needed_for(*current); + +/* + Shove a bunch of 0 length integers on the end to allow for overflow +*/ +for (wastage = 0; wastage < WASTAGE; wastage++) + *current_length++ = 0; + +/* + Process the lengths. 
To maximise SSE throughput we need each write to be 128-bit (4*32-bit) alignned + and therefore we need each compress "block" to be the same size where a compress "block" is a set of + four encoded integers starting on a 4-integer boundary. +*/ +for (current_length = length_buffer; current_length < length_buffer + source_integers + 4; current_length += 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = max(*current_length, *(current_length + 1), *(current_length + 2), *(current_length + 3)); + +/* + This code makes sure we can do aligned reads, promoting to larger integers if necessary +*/ +current_length = length_buffer; +while (current_length < length_buffer + source_integers) + { +#ifdef SHORT_END_BLOCKS + /* + If there are fewer than 16 values remaining and they all fit into 8-bits then its smaller than storing stripes + If there are fewer than 8 values remaining and they all fit into 16-bits then its smaller than storing stripes + If there are fewer than 4 values remaining and they all fit into 32-bits then its smaller than storing stripes + */ + if (source_integers - (current_length - length_buffer) < 4) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 16) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + else if (largest <= 32) + for (block = 0; block < 8; block++) + *(current_length + block) = 32; + } + else if (source_integers - (current_length - length_buffer) < 8) + { + largest = 0; + for (block = 0; block < 8; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 8; + else if (largest <= 8) + for (block = 0; block < 8; block++) + *(current_length + block) = 16; + } + else if (source_integers - 
(current_length - length_buffer) < 16) + { + largest = 0; + for (block = 0; block < 16; block++) + largest = max((uint8_t)largest, *(current_length + block)); + if (largest <= 8) + for (block = 0; block < 16; block++) + *(current_length + block) = 8; + } + /* + Otherwise we have the standard rules for a block + */ +#endif + /* + Two things need to happen to be able to use a particular selector. The first is that all the + values that would end up in that block need to use at most the bit value of that block. + The second is that there need to be at least as many numbers remaining as the block encodes. + + For example, if the current block only needs 0-bits per int, then check that the 256 values + that would be encoded only take 0-bits. If any value needs more, or there aren't 256 numbers remaining, + then promote the current block to try encode 128 1-bit values. + */ + switch (*current_length) + { + case 0: + if (source_integers - (current_length - length_buffer) < 256) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + break; + } + for (block = 0; block < 256; block += 4) + if (*(current_length + block) > 0) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 1; // promote + if (*current_length == 0) + { + for (block = 0; block < 256; block++) + current_length[block] = 0; + current_length += 256; + } + break; + case 1: + if (source_integers - (current_length - length_buffer) < 128) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + break; + } + for (block = 0; block < 128; block += 4) + if (*(current_length + block) > 1) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 2; // promote + if (*current_length == 1) + { + for (block = 0; block < 128; block++) + current_length[block] = 1; + current_length += 128; + } + break; + case 2: + if (source_integers - 
(current_length - length_buffer) < 64) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + break; + } + for (block = 0; block < 64; block += 4) + if (*(current_length + block) > 2) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 3; // promote + if (*current_length == 2) + { + for (block = 0; block < 64; block++) + current_length[block] = 2; + current_length += 64; + } + break; + case 3: + if (source_integers - (current_length - length_buffer) < 40) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + break; + } + for (block = 0; block < 40; block += 4) + if (*(current_length + block) > 3) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 4; // promote + if (*current_length == 3) + { + for (block = 0; block < 40; block++) + current_length[block] = 3; + current_length += 40; + } + break; + case 4: + if (source_integers - (current_length - length_buffer) < 32) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + break; + } + for (block = 0; block < 32; block += 4) + if (*(current_length + block) > 4) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 5; // promote + if (*current_length == 4) + { + for (block = 0; block < 32; block++) + current_length[block] = 4; + current_length += 32; + } + break; + case 5: + if (source_integers - (current_length - length_buffer) < 24) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + break; + } + for (block = 0; block < 24; block += 4) + if (*(current_length + block) > 5) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 6; // promote + if (*current_length == 5) + { + for (block = 0; block < 24; block++) + current_length[block] = 
5; + current_length += 24; + } + break; + case 6: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + break; + } + for (block = 0; block < 20; block += 4) + if (*(current_length + block) > 6) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 7; // promote + if (*current_length == 6) + { + for (block = 0; block < 20; block++) + current_length[block] = 6; + current_length += 20; + } + break; + case 7: + if (source_integers - (current_length - length_buffer) < 36) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + break; + } + for (block = 0; block < 36; block += 4) // 36 in a double 128-bit word + if (*(current_length + block) > 7) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 8; // promote + if (*current_length == 7) + { + for (block = 0; block < 36; block++) + current_length[block] = 7; + current_length += 36; + } + break; + case 8: + if (source_integers - (current_length - length_buffer) < 16) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + break; + } + for (block = 0; block < 16; block += 4) + if (*(current_length + block) > 8) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 9; // promote + if (*current_length == 8) + { + for (block = 0; block < 16; block++) + current_length[block] = 8; + current_length += 16; + } + break; + case 9: + if (source_integers - (current_length - length_buffer) < 28) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 10; // promote + break; + } + for (block = 0; block < 28; block += 4) // 28 in a double 128-bit word + if (*(current_length + block) > 9) + *current_length = *(current_length + 1) = *(current_length + 2) = 
*(current_length + 3) = 10; // promote + if (*current_length == 9) + { + for (block = 0; block < 28; block++) + current_length[block] = 9; + current_length += 28; + } + break; + case 10: + if (source_integers - (current_length - length_buffer) < 12) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + break; + } + for (block = 0; block < 12; block += 4) + if (*(current_length + block) > 10) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 12; // promote + if (*current_length == 10) + { + for (block = 0; block < 12; block++) + current_length[block] = 10; + current_length += 12; + } + break; + case 12: + if (source_integers - (current_length - length_buffer) < 20) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + break; + } + for (block = 0; block < 20; block += 4) // 20 in a double 128-bit word + if (*(current_length + block) > 12) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 16; // promote + if (*current_length == 12) + { + for (block = 0; block < 20; block++) + current_length[block] = 12; + current_length += 20; + } + break; + case 16: + if (source_integers - (current_length - length_buffer) < 8) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + break; + } + for (block = 0; block < 8; block += 4) + if (*(current_length + block) > 16) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 21; // promote + if (*current_length == 16) + { + for (block = 0; block < 8; block++) + current_length[block] = 16; + current_length += 8; + } + break; + case 21: + if (source_integers - (current_length - length_buffer) < 12) + { + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + break; + } + for (block = 0; block < 
12; block += 4) // 12 in a double 128-bit word + if (*(current_length + block) > 21) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 32; // promote + if (*current_length == 21) + { + for (block = 0; block < 12; block++) + current_length[block] = 21; + current_length += 12; + } + break; + case 32: + if (source_integers - (current_length - length_buffer) < 4) + { + for (block = 0; block < (source_integers - (current_length - length_buffer)); block++) + *(current_length + block) = 128; // promote + break; + } + for (block = 0; block < 4; block += 4) + if (*(current_length + block) > 32) + *current_length = *(current_length + 1) = *(current_length + 2) = *(current_length + 3) = 64; // promote + if (*current_length == 32) + { + for (block = 0; block < 4; block++) + current_length[block] = 32; + current_length += 4; + } + break; + case 128: + /* + The 128-bit selector is used as a last resort when there are not enough numbers to use an + earlier selector. So don't worry about checking the rest. 
+ */ + current_length += source_integers - (current_length - length_buffer); + break; + default: + exit(printf("Selecting on a non whole power of 2, must exit\n")); + break; + } + } + +/* + We can now compress based on the lengths in length_buffer +*/ +run_length = 1; +bits = length_buffer[0]; +keys = length_buffer; // we're going to re-use the length_buffer because it can't overlap and this saves a double malloc +for (current = (uint32_t *)source + 1; current < source + source_integers; current++) + { + new_needed = length_buffer[current - source]; + if (new_needed == bits) + run_length++; + else + { + write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + bits = new_needed; + run_length = 1; + } + } +write_out(&destination, (uint32_t *)current - run_length, run_length, bits, &keys); + +/* + Copy the lengths to the end, backwards +*/ +uint8_t *from = length_buffer + (keys - length_buffer) - 1; +uint8_t *to = destination; +for (uint32_t pos = 0; pos < keys - length_buffer; pos++) + *to++ = *from--; +destination += keys - length_buffer; + +/* + Compute the length (in bytes) +*/ +*nvalue = destination - (uint8_t *)into; // return length in bytes +} + +#ifdef MAKE_DECOMPRESS + /* + The following program generates the source code for ANT_compress_qmx_v4::decodeArray() + */ + /* + MAIN() + ------ + This version assumes SSE4.1 and so it is *not* portable to non X86 architectures + */ + int main(void) + { + uint32_t instance; + + printf("static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff};\n"); + printf("static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff};\n"); + printf("static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff};\n"); + printf("static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f};\n"); + printf("static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 
0x3f};\n"); + printf("static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f};\n"); + printf("static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f};\n"); + printf("static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07};\n"); + printf("static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03};\n"); + printf("static uint32_t ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01};\n"); + printf("void ANT_compress_qmx_v4::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers)\n"); + printf("{\n"); + printf("__m128i mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1;\n"); + printf("uint8_t *in = (uint8_t *)source;\n"); + printf("uint8_t *keys = ((uint8_t *)source) + len - 1;\n"); + + printf("\n"); + printf("mask_21 = _mm_loadu_si128((__m128i *)static_mask_21);\n"); + printf("mask_12 = _mm_loadu_si128((__m128i *)static_mask_12);\n"); + printf("mask_10 = _mm_loadu_si128((__m128i *)static_mask_10);\n"); + printf("mask_9 = _mm_loadu_si128((__m128i *)static_mask_9);\n"); + printf("mask_7 = _mm_loadu_si128((__m128i *)static_mask_7);\n"); + printf("mask_6 = _mm_loadu_si128((__m128i *)static_mask_6);\n"); + printf("mask_5 = _mm_loadu_si128((__m128i *)static_mask_5);\n"); + printf("mask_4 = _mm_loadu_si128((__m128i *)static_mask_4);\n"); + printf("mask_3 = _mm_loadu_si128((__m128i *)static_mask_3);\n"); + printf("mask_2 = _mm_loadu_si128((__m128i *)static_mask_2);\n"); + printf("mask_1 = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("\n"); + + printf("while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers\n"); + printf("\t{\n"); + printf("\tswitch (*keys--)\n"); + printf("\t\t{\n"); + + for (instance = 0; instance <= 0xFF; instance++) + { + printf("\t\tcase 0x%02x:\n", instance); + printf("\t\t\t_mm_prefetch(keys, _MM_HINT_NTA);\n"); + printf("\t\t\t{\n"); + if ((instance >> 4) == 0) + { + /* + 
256 0-bit integers + */ + printf("#ifdef NO_ZEROS\n"); + printf("\t\t\tconst __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1);\n"); + printf("#else\n"); + printf("\t\t\tconst __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); + printf("#endif\n"); + + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 10, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 11, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 12, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 13, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 14, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 15, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 16, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 17, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 18, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 19, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 20, 
tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 21, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 22, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 23, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 24, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 25, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 26, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 27, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 28, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 29, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 30, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 31, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 32, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 33, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 34, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 35, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 36, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 37, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 38, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 39, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 40, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 41, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 42, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 43, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 44, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 45, tmp);\n", run * 64); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 46, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 47, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 48, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 49, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 50, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 51, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 52, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 53, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 54, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 55, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 56, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 57, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 58, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 59, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 60, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 61, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 62, tmp);\n", run * 64); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 63, tmp);\n", run * 64); + printf("\n"); + } + printf("\t\t\tto += %d;\n", 256 * (0x10 - (instance & 0x0F))); // becomes 256 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 1) + { + /* + 128 * 1-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_1));\n", run * 32); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 16, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1));\n", run * 32); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), 
mask_1));\n", run * 32); + printf("}\n"); + printf("\n"); + } + printf("\t\t\tto += %d;\n", 128 * (0x10 - (instance & 0x0F))); // becomes 128 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 2) + { + /* + 64 * 2-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2));\n", run * 
16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2));\n", run * 16); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2));\n", run * 16); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 64 * (0x10 - (instance & 0x0F))); // becomes 64 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 3) + { + /* + 40 * 3-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3));\n", run * 10); + 
printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3));\n", run * 10); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3));\n", run * 10); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 40 * (0x10 - (instance & 0x0F))); // becomes 40 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 4) + { + /* + 32 * 4-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4));\n", run * 8); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4));\n", run * 8); + printf("}\n"); + + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 32 * (0x10 - (instance & 0x0F))); // becomes 32 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 5) + { + /* + 24 * 5-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + 
{ + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5));\n", run * 6); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5));\n", run * 6); + printf("}\n"); + printf("\n"); + } + printf("\t\t\tto += %d;\n", 24 * (0x10 - (instance & 0x0F))); // becomes 24 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 6) + { + /* + 20 * 6-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6));\n", run * 5); + printf("}\n"); + 
printf("\n"); + } + printf("\t\t\tto += %d;\n", 20 * (0x10 - (instance & 0x0F))); // becomes 20 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 7) + { + /* + 36 * 7 bit integers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_7));\n", run * 9); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7));\n", run * 9); + + printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7));\n", run * 9); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7));\n", run * 9); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 36 * (0x10 - (instance & 0x0F))); // becomes 36 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 8) + { + /* + 16 * 8-bit 
integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i tmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d, _mm_cvtepu8_epi32(tmp));\n", run * 4); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01))));\n", run * 4); + printf("\t\t\t const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));\n"); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 2, _mm_cvtepu8_epi32(tmp3));\n", run * 4); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01))));\n", run * 4); + printf("}\n"); + printf("\n"); + } + printf("\t\t\tto += %d;\n", 16 * (0x10 - (instance & 0x0F))); // becomes 16 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 9) + { + /* + 28 * 9-bit ingtegers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9));\n", run * 7); + + printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
5), _mm_srli_epi32(byte_stream, 27)), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9));\n", run * 7); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9));\n", run * 7); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 28 * (0x10 - (instance & 0x0F))); // becomes 28 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 10) + { + /* + 12 * 10-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_10));\n", run * 3); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10));\n", run * 3); + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10));\n", run * 3); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 12 * (0x10 - (instance & 0x0F))); // becomes 12 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 11) + { + /* + 20 * 12-bit ingtegers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_12));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12));\n", run * 5); + + + 
printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12));\n", run * 5); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12));\n", run * 5); + + printf("}\n"); + printf("\n"); + } + printf("\t\t\tto += %d;\n", 20 * (0x10 - (instance & 0x0F))); // becomes 20 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 12) + { + /* + 16-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i tmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_cvtepu16_epi32(tmp));\n", 2 * run); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)))));\n", 2 * run); + + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 8 * (0x10 - (instance & 0x0F))); // becomes 8 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 13) + { + /* + 12 * 21-bit ingtegers (in two 128-bit words) + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i byte_stream = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d, _mm_and_si128(byte_stream, mask_21));\n", run * 3); + + printf("\t\t\tconst __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, 
_MM_HINT_NTA);\n"); + + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21));\n", run * 3); + printf("\t\t\t_mm_storeu_si128((__m128i *)to + %d + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21));\n", run * 3); + + printf("}\n"); + printf("\n"); + } + printf("\t\t\tto += %d;\n", 12 * (0x10 - (instance & 0x0F))); // becomes 8 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 14) + { + /* + 32-bit integers + */ + for (uint32_t run = 0; run < 0x10 - (instance & 0x0F); run++) + { + printf("{\n"); + printf("\t\t\tconst __m128i tmp = _mm_loadu_si128((__m128i *)in);\n"); + printf("\t\t\t_mm_prefetch(in += 16, _MM_HINT_NTA);\n"); + + printf("\t\t\t _mm_storeu_si128((__m128i *)to + %d, tmp);\n", run); + printf("}\n"); + printf("\n"); + } + + printf("\t\t\tto += %d;\n", 4 * (0x10 - (instance & 0x0F))); // becomes 4 integers + printf("\t\t\tbreak;\n"); + } + else if (instance >> 4 == 15) + { + /* + 128-bit integers + if there are fewer than 4 integes then we just bit-pack them in to 8, 16, 24, or 32-bit words + */ + if ((instance & 0x0C) == 0x00) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + { + printf("\t\t\t_mm_prefetch(in + %d, _MM_HINT_NTA);\n", run + 1); + printf("\t\t\t*(to + %d) = *(uint8_t *)(in + %d);\n", run, run); + } + printf("\t\t\tin += %d;\n", 0x04 - (instance & 0x03)); // 1 byte integer + printf("\t\t\tto += %d;\n", 0x04 - (instance & 0x03)); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x04) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + { + printf("\t\t\t_mm_prefetch(in + 2 * %d, _MM_HINT_NTA);\n", run + 1); + printf("\t\t\t*(to + %d) = *(uint16_t *)(in + 2 * %d);\n", run, run); + } + printf("\t\t\tin += 2 * %d;\n", 0x04 - (instance & 0x03)); // 2 byte integers + printf("\t\t\tto += %d;\n", 0x04 - (instance & 0x03)); // becomes 1 integer + } + else if ((instance & 
0x0C) == 0x08) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + { + printf("\t\t\t_mm_prefetch(in + 3 * %d, _MM_HINT_NTA);\n", run + 1); + printf("\t\t\t*(to + %d) = (*(uint8_t *)(in + 3 * %d) << 16) | (*(uint8_t *)(in + 3 * %d + 1) << 8) | (*(uint8_t *)(in + 3 * %d + 2));\n", run, run, run, run); + } + printf("\t\t\tin += 3 * %d;\n", 0x04 - (instance & 0x03)); // 3 byte integer + printf("\t\t\tto += %d;\n", 0x04 - (instance & 0x03)); // becomes 1 integer + } + else if ((instance & 0x0C) == 0x0C) + { + for (uint32_t run = 0; run < 0x04 - (instance & 0x03); run++) + { + printf("\t\t\t_mm_prefetch(in + 4 * %d, _MM_HINT_NTA);\n", run + 1); + printf("\t\t\t*(to + %d) = *(uint32_t *)(in + 4 * %d);\n", run, run); + } + printf("\t\t\tin += 4 * %d;\n", 0x04 - (instance & 0x03)); // 4 byte integer + printf("\t\t\tto += %d;\n", 0x04 - (instance & 0x03)); // becomes 1 integer + } + printf("\t\t\tbreak;\n"); + } + else + { + printf("\t\t\tin++;\n"); // dummy, can't occur + } + printf("\t\t\t}\n"); + } + printf("\t\t}\n"); + printf("\t}\n"); + printf("}\n"); + } + +#endif + +#ifdef TEST_ONE_STRING + static uint32_t sequence[]={13,1,1,26,18,3,1,9,4,8,5,19,7,26,1,5,7,3,12,5,39,16,3,5,19,8,18,1,1,1,2,5,9,3,21,2,6,37,3,5,5,18,3,31,3,22,5,17,6,12,6,2,5,10,3,12,51,14,7,8,1,2,3,27,19,1,10,8,2,7,2,9,16,6,6,5,6,4,18,21,13,2,1,11,3,22,2,16,13,61,21,12,51,10,6,31,14,65,15,82,5,4,18,3,1,1,4,34,5,9,4,7,1,25,17,52,60,8,8,4,22,7,49,26,2,72,29,33,6,11,3,8,1,23,37,1,3,1,1,1,3,20,6,1,2,1,1,1,14,2,4,1,6,4,4,3,1,1,2,2,1,9,29,1,10,11,4,10,31}; + + static uint32_t second_compress_buffer[100000]; + static uint32_t second_decompress_buffer[100000]; + + uint32_t second_compress_buffer_size = sizeof(second_compress_buffer) / sizeof(*second_compress_buffer); + uint32_t second_decompress_buffer_size = sizeof(second_decompress_buffer) / sizeof(*second_decompress_buffer); + + /* + CHECK() + ------- + */ + void check(uint32_t *sequence, uint32_t sequence_length) + { + ANT_compress_qmx_v4 
compressor; + uint64_t buffer_size; + uint32_t pos; + uint32_t fail; + + memset(second_compress_buffer, 0, second_compress_buffer_size); + memset(second_decompress_buffer, 0, second_decompress_buffer_size); + + compressor.encodeArray(sequence, sequence_length, (uint32_t *)second_compress_buffer, &buffer_size); + second_compress_buffer[buffer_size] = 0; + second_compress_buffer[buffer_size + 1] = 0; + second_compress_buffer[buffer_size + 2] = 0; + second_compress_buffer[buffer_size + 3] = 0; + + for (pos = 0; pos < buffer_size; pos++) + printf("%02X ", ((uint8_t *)second_compress_buffer)[pos]); + puts(""); + + compressor.decodeArray((uint32_t *)second_compress_buffer, buffer_size, (uint32_t *)second_decompress_buffer, sequence_length); + + fail = false; + for (pos = 0; pos < sequence_length; pos++) + if (sequence[pos] != second_decompress_buffer[pos]) + { + printf("p[%d]:%X != %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + fail = true; + } + else + printf("p[%d]:%X == %X\n", pos, sequence[pos], second_decompress_buffer[pos]); + + if (fail) + puts("Test failed"); + else + puts("Test succeeded"); + } + + /* + MAIN() + ------ + */ + int main(void) + { + check(sequence, sizeof(sequence) / sizeof(*sequence)); + } +#endif +/* + ANT_ANT_COMPRESS_QMX_V4::DECODEARRAY() + -------------------------------- + this code was generated by the method above. +*/ +#include "compress_qmx_v4_decompress.cpp" diff --git a/ext/bench_/bench/compress_qmx_v4.h b/ext/bench_/bench/compress_qmx_v4.h new file mode 100644 index 0000000..2961ca9 --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v4.h @@ -0,0 +1,43 @@ +/* + COMPRESS_QMX_V4.H + ----------------- + QMX without run-length encoding, no unwind, no prefetch. 
+*/ +#ifndef COMPRESS_QMX_V4_H_ +#define COMPRESS_QMX_V4_H_ + +#include +#include "compress.h" + +/* + class ANT_COMPRESS_QMX_V4 + ------------------------- +*/ +class ANT_compress_qmx_v4 : public ANT_compress +{ +private: + uint8_t *length_buffer; + uint64_t length_buffer_length; + +public: + ANT_compress_qmx_v4(); + virtual ~ANT_compress_qmx_v4(); + + void encodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t *nvalue); + void decodeArray(const uint32_t *in, uint64_t len, uint32_t *out, uint64_t nvalue); + + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) + { + uint64_t answer; + encodeArray(source, source_integers, (uint32_t *)destination, &answer); + return answer; + } + + virtual void decompress(uint32_t *destination, uint64_t destinaton_integers, uint8_t *source, uint64_t source_length) + { + decodeArray((uint32_t *)source, source_length, destination, destinaton_integers); + } +} ; + +#endif + diff --git a/ext/bench_/bench/compress_qmx_v4_decompress.cpp b/ext/bench_/bench/compress_qmx_v4_decompress.cpp new file mode 100644 index 0000000..1e4d2f6 --- /dev/null +++ b/ext/bench_/bench/compress_qmx_v4_decompress.cpp @@ -0,0 +1,36428 @@ +static uint32_t ALIGN_16 static_mask_21[] = {0x1fffff, 0x1fffff, 0x1fffff, 0x1fffff}; +static uint32_t ALIGN_16 static_mask_12[] = {0xfff, 0xfff, 0xfff, 0xfff}; +static uint32_t ALIGN_16 static_mask_10[] = {0x3ff, 0x3ff, 0x3ff, 0x3ff}; +static uint32_t ALIGN_16 static_mask_9[] = {0x1ff, 0x1ff, 0x1ff, 0x1ff}; +static uint32_t ALIGN_16 static_mask_7[] = {0x7f, 0x7f, 0x7f, 0x7f}; +static uint32_t ALIGN_16 static_mask_6[] = {0x3f, 0x3f, 0x3f, 0x3f}; +static uint32_t ALIGN_16 static_mask_5[] = {0x1f, 0x1f, 0x1f, 0x1f}; +static uint32_t ALIGN_16 static_mask_4[] = {0x0f, 0x0f, 0x0f, 0x0f}; +static uint32_t ALIGN_16 static_mask_3[] = {0x07, 0x07, 0x07, 0x07}; +static uint32_t ALIGN_16 static_mask_2[] = {0x03, 0x03, 0x03, 0x03}; +static uint32_t 
ALIGN_16 static_mask_1[] = {0x01, 0x01, 0x01, 0x01}; +void ANT_compress_qmx_v4::decodeArray(const uint32_t *source, uint64_t len, uint32_t *to, uint64_t destination_integers) +{ +__m128i mask_21, mask_12, mask_10, mask_9, mask_7, mask_6, mask_5, mask_4, mask_3, mask_2, mask_1; +uint8_t *in = (uint8_t *)source; +uint8_t *keys = ((uint8_t *)source) + len - 1; + +mask_21 = _mm_loadu_si128((__m128i *)static_mask_21); +mask_12 = _mm_loadu_si128((__m128i *)static_mask_12); +mask_10 = _mm_loadu_si128((__m128i *)static_mask_10); +mask_9 = _mm_loadu_si128((__m128i *)static_mask_9); +mask_7 = _mm_loadu_si128((__m128i *)static_mask_7); +mask_6 = _mm_loadu_si128((__m128i *)static_mask_6); +mask_5 = _mm_loadu_si128((__m128i *)static_mask_5); +mask_4 = _mm_loadu_si128((__m128i *)static_mask_4); +mask_3 = _mm_loadu_si128((__m128i *)static_mask_3); +mask_2 = _mm_loadu_si128((__m128i *)static_mask_2); +mask_1 = _mm_loadu_si128((__m128i *)static_mask_1); + +while (in <= keys) // <= because there can be a boundary case where the final key is 255*0 bit integers + { + switch (*keys--) + { + case 0x00: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to 
+ 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 832, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 896, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 18, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 57, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 960, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 960 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 960 + 63, tmp); + + to += 4096; + break; + } + case 0x01: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); 
+ _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + 
_mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 832, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 896, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 47, tmp); + 
_mm_storeu_si128((__m128i *)to + 896 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 896 + 63, tmp); + + to += 3840; + break; + } + case 0x02: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i 
*)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, 
tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 832, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 9, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 48, tmp); + 
_mm_storeu_si128((__m128i *)to + 832 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 832 + 63, tmp); + + to += 3584; + break; + } + case 0x03: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to 
+ 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 
12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); 
+ _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 768, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 768 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 768 + 63, tmp); + + to += 3328; + break; + } + case 0x04: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 704, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 7, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 46, tmp); + 
_mm_storeu_si128((__m128i *)to + 704 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 704 + 63, tmp); + + to += 3072; + break; + } + case 0x05: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i 
*)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, 
tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + 
_mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 640, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 5, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 44, tmp); + 
_mm_storeu_si128((__m128i *)to + 640 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 640 + 63, tmp); + + to += 2816; + break; + } + case 0x06: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 
0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + 
_mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 576, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 576 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 576 + 63, tmp); + + to += 2560; + break; + } + case 0x07: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); 
+#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to 
+ 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); 
+ _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 512, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 36, tmp); + 
_mm_storeu_si128((__m128i *)to + 512 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 512 + 63, tmp); + + to += 2304; + break; + } + case 0x08: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + 
_mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + 
+ _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 448, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 30, tmp); + 
_mm_storeu_si128((__m128i *)to + 448 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 448 + 63, tmp); + + to += 2048; + break; + } + case 0x09: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 384, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 11, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 50, tmp); + 
_mm_storeu_si128((__m128i *)to + 384 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 384 + 63, tmp); + + to += 1792; + break; + } + case 0x0a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to 
+ 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 
14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); 
+ _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 320, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 17, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 56, tmp); + 
_mm_storeu_si128((__m128i *)to + 320 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 320 + 63, tmp); + + to += 1536; + break; + } + case 0x0b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + 
_mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 
128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, 
tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 256, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 9, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 48, tmp); + 
_mm_storeu_si128((__m128i *)to + 256 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 256 + 63, tmp); + + to += 1280; + break; + } + case 0x0c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to 
+ 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, 
tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 
12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); 
+ _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 192, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 26, tmp); + 
_mm_storeu_si128((__m128i *)to + 192 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 192 + 63, tmp); + + to += 1024; + break; + } + case 0x0d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i 
tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 
37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 128, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 29, tmp); + 
_mm_storeu_si128((__m128i *)to + 128 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 128 + 63, tmp); + + to += 768; + break; + } + case 0x0e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); 
+#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + _mm_storeu_si128((__m128i *)to + 64, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 16, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 28, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 56, tmp); + _mm_storeu_si128((__m128i 
*)to + 64 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 64 + 63, tmp); + + to += 512; + break; + } + case 0x0f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +#ifdef NO_ZEROS + const __m128i tmp = _mm_loadu_si128((__m128i *)static_mask_1); +#else + const __m128i tmp = _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); +#endif + _mm_storeu_si128((__m128i *)to + 0, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 1, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 2, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 3, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 4, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 5, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 6, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 7, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 8, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 9, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 10, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 11, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 12, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 13, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 14, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 15, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 16, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 17, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 18, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 19, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 20, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 21, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 22, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 23, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 24, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 25, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 26, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 27, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 28, 
tmp); + _mm_storeu_si128((__m128i *)to + 0 + 29, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 30, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 31, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 32, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 33, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 34, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 35, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 36, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 37, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 38, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 39, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 40, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 41, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 42, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 43, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 44, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 45, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 46, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 47, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 48, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 49, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 50, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 51, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 52, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 53, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 54, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 55, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 56, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 57, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 58, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 59, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 60, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 61, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 62, tmp); + _mm_storeu_si128((__m128i *)to + 0 + 63, tmp); + + to += 256; + break; + } + case 0x10: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 
27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 
1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 
+ 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 
13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 
384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 
384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 
+ 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 480, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 480 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 2048; + break; + } + case 0x11: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 
+ 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), 
mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 
18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 31, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 448, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 
+ 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 448 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1920; + break; + } + case 0x12: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 
+ 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 
+ 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 
17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 
+ 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 416, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 
+ 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 416 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1792; + break; + } + case 0x13: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 
16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 
32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); 
+ _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 
+ 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 
16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 
+ 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 
352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 384, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 
15), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 384 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1664; + break; + } + case 0x14: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 
2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 
96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 
128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 
15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 
+ 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 
320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 352, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 
14), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 352 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1536; + break; + } + case 0x15: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, 
_mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 
14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 
96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 
128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 
14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 
+ 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + 
_mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 320, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 320 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 320 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1408; + break; + } + case 0x16: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to 
+ 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 
26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, 
_mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 
20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 
13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 
224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 
224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 288, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 
+ 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 288 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1280; + break; + } + case 0x17: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 
+ 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 256, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 256 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1152; + break; + } + case 0x18: + 
_mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); 
+ _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 
6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 
+ 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 224 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 224 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 1024; + break; + } + case 0x19: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 
+ 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 192 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 896; + break; + } + case 0x1a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 
+ 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 160 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 768; + break; + } + case 0x1b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + 
_mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 128 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 640; + break; + } + case 0x1c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 
12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 
19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 13, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 96 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 512; + break; + } + case 0x1d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 
13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 64 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 384; + break; + } + case 0x1e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i 
*)to + 32 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 32 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 256; + break; + } + case 0x1f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 1), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 7), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 11), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 13), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 16, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 17, _mm_and_si128(_mm_srli_epi64(byte_stream, 17), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 18, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 19, _mm_and_si128(_mm_srli_epi64(byte_stream, 19), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 20, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 21, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 22, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 23, _mm_and_si128(_mm_srli_epi64(byte_stream, 23), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 24, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 25, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 26, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 27, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 28, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 29, _mm_and_si128(_mm_srli_epi64(byte_stream, 29), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 30, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_1)); + _mm_storeu_si128((__m128i *)to + 0 + 31, _mm_and_si128(_mm_srli_epi64(byte_stream, 31), mask_1)); +} + + to += 128; + break; + } + case 
0x20: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 
176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 
16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 240, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); 
+ _mm_storeu_si128((__m128i *)to + 240 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 240 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 1024; + break; + } + case 0x21: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 
128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, 
mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 
6), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 224, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 224 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 960; + break; + } + case 0x22: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 
128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + 
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 208, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 208 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 896; + break; + } + case 0x23: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 
128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + 
_mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 192, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 192 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 832; + break; + } + case 0x24: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + 
_mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 
160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 176, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 176 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 768; + break; + } + case 0x25: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 
112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i 
*)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 160, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 160 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 704; + break; + } + case 0x26: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i 
*)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 
+ 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 144, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 144 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 640; + break; + } + case 0x27: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 128, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 
+ 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 128 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 576; + break; + } + case 0x28: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 112 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 512; + break; + } + case 0x29: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, 
_mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 96 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 448; + break; + } + case 0x2a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 
22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), 
mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + 
_mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + 
_mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 80 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 384; + break; + } + case 0x2b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 64 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 320; + break; + } + case 0x2c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 12, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 48 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 256; + break; + } + case 0x2d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 
28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_2)); + 
_mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 32 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 192; + break; + } + case 0x2e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 
3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 16 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 128; + break; + } + case 0x2f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 2), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 14), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 
16), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 10, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 11, _mm_and_si128(_mm_srli_epi64(byte_stream, 22), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 12, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 13, _mm_and_si128(_mm_srli_epi64(byte_stream, 26), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 14, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_2)); + _mm_storeu_si128((__m128i *)to + 0 + 15, _mm_and_si128(_mm_srli_epi64(byte_stream, 30), mask_2)); +} + + to += 64; + break; + } + case 0x30: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 150, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 150 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 640; + break; + } + case 0x31: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to 
+ 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, 
_mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 140, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 140 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 600; + break; + } + case 0x32: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 
27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 
40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 100, 
_mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 130, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 130 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 560; + break; + } + case 0x33: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 
30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 120 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 520; + break; + } + case 0x34: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, 
_mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 110, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 110 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 480; + break; + } + case 0x35: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, 
_mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 
50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 100, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 100 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 440; + break; + } + case 0x36: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 
30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 90 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 400; + break; + } + case 0x37: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 
40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 80 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 360; + break; + } + case 0x38: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 
16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 
40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 70 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 320; + break; + } + case 0x39: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); 
+ _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 
+ 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 60 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 280; + break; + } + case 0x3a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 
4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 50 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 240; + break; + } + case 0x3b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 
30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 40 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 200; + break; + } + case 0x3c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), 
mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i 
*)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 30 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 160; + break; + } + case 0x3d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, 
_mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), 
mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 20 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 120; + break; + } + case 0x3e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 10 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 80; + break; + } + case 0x3f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 3), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 9), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_3)); + 
_mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 21), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_3)); + _mm_storeu_si128((__m128i *)to + 0 + 9, _mm_and_si128(_mm_srli_epi64(byte_stream, 27), mask_3)); +} + + to += 40; + break; + } + case 0x40: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 
80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 120, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 120 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 512; + break; + } + case 0x41: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + 
_mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, 
_mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 112, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 112 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 112 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 480; + break; + } + case 0x42: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 
64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 104, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 104 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 448; + break; + } + case 0x43: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 
1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, 
_mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 96, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 96 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 416; + break; + } + case 0x44: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 
80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 88, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 88 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 384; + break; + } + case 0x45: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 80, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 80 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 352; + break; + } + case 0x46: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to 
+ 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), 
mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to 
+ 72 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 320; + break; + } + case 0x47: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 
2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 64, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 64 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 288; + break; + } + case 0x48: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + 
const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 56 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 256; + break; + } + case 0x49: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); 
+ _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, 
_mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 48 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 224; + break; + } + case 0x4a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 40 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 192; + break; + } + case 0x4b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} 
+ +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 32 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 160; + break; + } + case 0x4c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); 
+ _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 24 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 128; + break; + } + case 0x4d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + 
_mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 16 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 96; + break; + } + case 0x4e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 8 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 64; + break; + } + case 0x4f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 4), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 8), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 16), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_4)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi64(byte_stream, 28), mask_4)); +} + + to += 32; + break; + } + case 0x50: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + 
_mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 
15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i 
*)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 384; + break; + } + case 0x51: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to 
+ 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in 
+= 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 
20), mask_5)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 360; + break; + } + case 0x52: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 
10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i 
*)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 78, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 78 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 336; + break; + } + case 0x53: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 
20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i 
*)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 312; + break; + } + case 0x54: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + 
_mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 
5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i 
*)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 66, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 66 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 288; + break; + } + case 0x55: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 
20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 60 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 264; + break; + } + case 0x56: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 
42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 240; + break; + } + case 0x57: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); 
+ _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i 
*)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 48 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 
216; + break; + } + case 0x58: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 
30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 192; + break; + } + case 0x59: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + 
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 
5), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 168; + break; + } + case 0x5a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, 
_mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 
16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 30 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 144; + break; + } + case 0x5b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 
20), mask_5)); + _mm_storeu_si128((__m128i *)to + 24 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 120; + break; + } + case 0x5c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 96; + break; + } + case 0x5d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + 
_mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 12 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 72; + break; + } + case 0x5e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, 
_mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 6 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 48; + break; + } + case 0x5f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 5), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 15), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_5)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi64(byte_stream, 25), mask_5)); +} + + to += 24; + break; + } + case 0x60: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + 
_mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 320; + break; + } + case 0x61: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 
18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 300; + break; + } + case 0x62: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + 
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 280; + break; + } + case 0x63: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ 
+ const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 
18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 260; + break; + } + case 0x64: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), 
mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + 
_mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 240; + break; + } + case 0x65: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 
18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 220; + break; + } + case 0x66: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 200; + break; + } + case 0x67: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 
24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 180; + break; + } + case 0x68: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, 
_mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 160; + break; + } + case 0x69: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + 
_mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 140; + break; + } + case 0x6a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), 
mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 120; + break; + } + case 0x6b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, 
_mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + 
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 100; + break; + } + case 0x6c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 80; + break; + } + case 0x6d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 60; + break; + } + case 0x6e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 40; + break; + } + case 0x6f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_6)); + 
_mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 6), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 12), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi64(byte_stream, 18), mask_6)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi64(byte_stream, 24), mask_6)); +} + + to += 20; + break; + } + case 0x70: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 135, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 135 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 135 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 576; + break; + } + case 0x71: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), 
_mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 117 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 126, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 126 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 126 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 126 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 540; + break; + } + case 0x72: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), 
mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + 
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i 
*)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 117, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 117 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 117 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 504; + break; + } + case 0x73: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 
7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), 
mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 108 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 108 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 468; + break; + } + case 0x74: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i 
*)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 99 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 99 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 432; + break; + } + case 0x75: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + 
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + 
_mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 90 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 90 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 90 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 396; + break; + } + case 0x76: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 81 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 81 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 360; + break; + } + case 0x77: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + 
const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + 
_mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 72 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 72 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 72 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 324; + break; + } + case 0x78: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 
4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 63 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 288; + break; + } + case 0x79: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), 
mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 54 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 54 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 252; + break; + } + case 0x7a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), 
mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + 
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 45 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 45 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 216; + break; + } + case 0x7b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), 
mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 
16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 36 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 36 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 180; + break; + } + case 0x7c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + 
_mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 27 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 144; + break; + } + case 0x7d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i 
*)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 18 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 108; + break; + } + case 0x7e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 
3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 9 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 72; + break; + } + case 0x7f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 7), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 14), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream, 21), mask_7)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 4), _mm_srli_epi32(byte_stream, 28)), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 3), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 10), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 7, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 17), mask_7)); + _mm_storeu_si128((__m128i *)to + 0 + 8, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 24), mask_7)); +} + + to += 36; + break; + } + case 0x80: + _mm_prefetch(keys, 
_MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + 
_mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 256; + break; + } + case 0x81: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + 
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + 
_mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), 
_mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 240; + break; + } + case 0x82: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), 
_mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, 
_mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 52, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 52 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 52 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 52 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 224; + break; + } + case 0x83: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + 
_mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 48, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 48 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 48 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 48 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 208; + break; + } + case 0x84: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + 
_mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i 
*)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), 
_mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 44, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 44 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 44 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 44 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 192; + break; + } + case 0x85: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, 
_mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 176; + break; + } + case 0x86: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + 
_mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 36 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 160; + break; + } + case 0x87: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + 
_mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i 
*)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), 
_mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 32, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 32 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 32 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 32 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 144; + break; + } + case 0x88: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), 
_mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 128; + break; + } + case 0x89: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, 
_mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 24 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 112; + break; + } + case 0x8a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, 
_mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 96; + break; + } + case 0x8b: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 16 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 16 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 80; + break; + } + case 0x8c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 12 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 64; + break; + } + case 0x8d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = 
_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 8 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 8 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 48; + break; + } + case 0x8e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 4 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 4 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 32; + break; + } + case 0x8f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu8_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp), 0x01)))); + const __m128i tmp3 = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_cvtepu8_epi32(tmp3)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_cvtepu8_epi32(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp3), _mm_castsi128_ps(tmp3), 0x01)))); +} + + to += 16; + break; + } + case 0x90: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); 
+ _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 
1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 105, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 105 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 105 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 448; + break; + } + case 0x91: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 98, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 98 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 98 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 420; + break; + } + case 0x92: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, 
_mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 91, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 91 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 91 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 392; + break; + } + case 0x93: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, 
_mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 84 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 84 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 364; + break; + } + case 0x94: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 77 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 77 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 336; + break; + } + case 0x95: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, 
_mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 70 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 308; + break; + } + case 0x96: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 63 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 63 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 280; + break; + } + case 0x97: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 56 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 56 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 252; + break; + } + case 0x98: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + 
_mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i 
byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 49 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 49 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 224; + break; + } + case 0x99: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + 
_mm_storeu_si128((__m128i *)to + 42 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 42 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 196; + break; + } + case 0x9a: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 35 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 168; + break; + } + case 0x9b: + 
_mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_and_si128(byte_stream, mask_9)); + 
_mm_storeu_si128((__m128i *)to + 28 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 28 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 140; + break; + } + case 0x9c: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 21 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 112; + break; + } + case 0x9d: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), 
mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 14 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 84; + break; + } + case 0x9e: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 7 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 56; + break; + } + case 0x9f: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 9), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream, 18), mask_9)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in 
+= 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 5), _mm_srli_epi32(byte_stream, 27)), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 4), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 5, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 13), mask_9)); + _mm_storeu_si128((__m128i *)to + 0 + 6, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 22), mask_9)); +} + + to += 28; + break; + } + case 0xa0: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + 
_mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), 
mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 192; + break; + } + case 0xa1: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); 
+} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 180; + break; + } + case 0xa2: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const 
__m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 168; + break; + } + case 0xa3: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, 
_mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 156; + break; + } + case 0xa4: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, 
mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 144; + break; + } + case 0xa5: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i 
*)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 132; + break; + } + case 0xa6: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i 
*)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 
10), mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 120; + break; + } + case 0xa7: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), 
mask_10)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 108; + break; + } + case 0xa8: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 96; + break; + } + case 0xa9: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + 
_mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 1, 
_mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 84; + break; + } + case 0xaa: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i 
*)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 72; + break; + } + case 0xab: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_10)); + 
_mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 60; + break; + } + case 0xac: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, 
mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 48; + break; + } + case 0xad: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 36; + break; + } + case 0xae: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in 
+= 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 24; + break; + } + case 0xaf: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi64(byte_stream, 10), mask_10)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi64(byte_stream, 20), mask_10)); +} + + to += 12; + break; + } + case 0xb0: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i 
*)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, 
mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 
4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); + 
_mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 75, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 75 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 75 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 75 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 75 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 320; + break; + } + case 0xb1: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 
16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i 
*)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, 
_mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 70 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 70 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 70 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 300; + break; + } + case 0xb2: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 
+ 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 65 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 65 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 280; + break; + } + case 0xb3: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), 
mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 1, 
_mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 60 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 60 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 260; + break; + } + case 0xb4: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, 
_mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 55 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 55 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 240; + break; + } + case 0xb5: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in 
+= 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 50 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 50 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 220; + break; + } + case 0xb6: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i 
byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); 
+ _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const 
__m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 45 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 200; + break; + } + case 0xb7: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + 
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 40 + 2, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 40 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 180; + break; + } + case 0xb8: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 35 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 35 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 160; + break; + } + case 0xb9: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + 
_mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 30 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 140; + break; + } + case 0xba: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 
1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), 
mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 25 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 25 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 120; + break; + } + case 0xbb: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + 
_mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 20 + 4, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 100; + break; + } + case 0xbc: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 15 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 80; + break; + } + case 0xbd: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i 
*)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 10 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 60; + break; + } + case 0xbe: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), 
_mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 5 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 40; + break; + } + case 0xbf: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_srli_epi32(byte_stream, 12), mask_12)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 8), _mm_srli_epi32(byte_stream, 24)), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 3, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 8), mask_12)); + _mm_storeu_si128((__m128i *)to + 0 + 4, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 20), mask_12)); +} + + to += 20; + break; + } + case 0xc0: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 128; + break; + } + case 0xc1: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 28, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 28 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 120; + break; + } + case 0xc2: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 26, _mm_cvtepu16_epi32(tmp)); + 
_mm_storeu_si128((__m128i *)to + 26 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 112; + break; + } + case 0xc3: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 104; + break; + } + case 0xc4: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 22, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 22 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 96; + break; + } + case 0xc5: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 20, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 20 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 88; + break; + } + case 0xc6: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 80; + break; + } + case 0xc7: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 16, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 16 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 72; + break; + } + case 0xc8: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, 
_mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), 
_mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 14 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 64; + break; + } + case 0xc9: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 56; + break; + } + case 0xca: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, 
_MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 10 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 48; + break; + } + case 0xcb: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, 
_mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 8 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 40; + break; + } + case 0xcc: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 32; + break; + } + case 0xcd: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to 
+ 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 4 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 24; + break; + } + case 0xce: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 2 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 16; + break; + } + case 0xcf: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_cvtepu16_epi32(tmp)); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_cvtepu16_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))))); +} + + to += 8; + break; + } + case 0xd0: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, 
_mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, 
_mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 45, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 45 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 45 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 192; + break; + } + case 0xd1: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 42 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 42 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 180; + break; + } + case 0xd2: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 
6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 39, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 39 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 39 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 168; + break; + } + case 0xd3: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 36 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 36 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 156; + break; + } + case 0xd4: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 
+ 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, 
_mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 33, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 33 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 33 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 144; + break; + } + case 0xd5: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 30 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 30 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 132; + break; + } + case 0xd6: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 27 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 27 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 120; + break; + } + case 0xd7: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 24 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 24 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 108; + break; + } + case 0xd8: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 21 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 21 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 96; + break; + } + case 0xd9: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), 
_mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 18 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 18 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 84; + break; + } + case 0xda: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 
3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 15 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 72; + break; + } + case 0xdb: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 12 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 60; + break; + } + case 0xdc: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 9 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 48; + break; + } + case 0xdd: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 6 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 36; + break; + } + case 0xde: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, 
_mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 3 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 24; + break; + } + case 0xdf: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i byte_stream = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, _mm_and_si128(byte_stream, mask_21)); + const __m128i byte_stream_2 = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0 + 1, _mm_and_si128(_mm_or_si128(_mm_slli_epi32(byte_stream_2, 11), _mm_srli_epi32(byte_stream, 21)), mask_21)); + _mm_storeu_si128((__m128i *)to + 0 + 2, _mm_and_si128(_mm_srli_epi32(byte_stream_2, 11), mask_21)); +} + + to += 12; + break; + } + case 0xe0: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 13, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 15, tmp); +} + + to += 64; + break; + } + case 0xe1: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 13, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 14, tmp); +} + + to += 60; + break; + } + case 0xe2: + 
_mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + 
_mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 13, tmp); +} + + to += 56; + break; + } + case 0xe3: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 
16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 12, tmp); +} + + to += 52; + break; + } + case 0xe4: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 11, tmp); +} + + to += 48; + break; + } + case 0xe5: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i 
tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 10, tmp); +} + + to += 44; + break; + } + case 0xe6: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 9, tmp); +} + + to += 40; + break; + } + case 0xe7: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 8, tmp); +} + + to += 36; + break; + } + case 0xe8: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 7, tmp); +} + + to += 32; + break; + } + case 0xe9: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + 
_mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 6, tmp); +} + + to += 28; + break; + } + case 0xea: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 5, tmp); +} + + to += 24; + break; + } + case 0xeb: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 4, tmp); +} + + to += 20; + break; + } + case 0xec: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 3, tmp); +} + + to += 16; + break; + } + case 0xed: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 2, tmp); +} + + to += 12; + break; + } + case 0xee: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = 
_mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 1, tmp); +} + + to += 8; + break; + } + case 0xef: + _mm_prefetch(keys, _MM_HINT_NTA); + { +{ + const __m128i tmp = _mm_loadu_si128((__m128i *)in); + _mm_prefetch(in += 16, _MM_HINT_NTA); + _mm_storeu_si128((__m128i *)to + 0, tmp); +} + + to += 4; + break; + } + case 0xf0: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 1, _MM_HINT_NTA); + *(to + 0) = *(uint8_t *)(in + 0); + _mm_prefetch(in + 2, _MM_HINT_NTA); + *(to + 1) = *(uint8_t *)(in + 1); + _mm_prefetch(in + 3, _MM_HINT_NTA); + *(to + 2) = *(uint8_t *)(in + 2); + _mm_prefetch(in + 4, _MM_HINT_NTA); + *(to + 3) = *(uint8_t *)(in + 3); + in += 4; + to += 4; + break; + } + case 0xf1: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 1, _MM_HINT_NTA); + *(to + 0) = *(uint8_t *)(in + 0); + _mm_prefetch(in + 2, _MM_HINT_NTA); + *(to + 1) = *(uint8_t *)(in + 1); + _mm_prefetch(in + 3, _MM_HINT_NTA); + *(to + 2) = *(uint8_t *)(in + 2); + in += 3; + to += 3; + break; + } + case 0xf2: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 1, _MM_HINT_NTA); + *(to + 0) = *(uint8_t *)(in + 0); + _mm_prefetch(in + 2, _MM_HINT_NTA); + *(to + 1) = *(uint8_t *)(in + 1); + in += 2; + to += 2; + break; + } + case 0xf3: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 1, _MM_HINT_NTA); + *(to + 0) = *(uint8_t *)(in + 0); + in += 1; + to += 1; + break; + } + case 0xf4: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 2 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint16_t *)(in + 2 * 0); + _mm_prefetch(in + 2 * 2, _MM_HINT_NTA); + *(to + 1) = *(uint16_t *)(in + 2 * 1); + _mm_prefetch(in + 2 * 3, _MM_HINT_NTA); + *(to + 2) = *(uint16_t *)(in + 2 * 2); + _mm_prefetch(in + 2 * 4, _MM_HINT_NTA); + *(to + 3) = *(uint16_t *)(in + 2 * 3); + in += 
2 * 4; + to += 4; + break; + } + case 0xf5: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 2 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint16_t *)(in + 2 * 0); + _mm_prefetch(in + 2 * 2, _MM_HINT_NTA); + *(to + 1) = *(uint16_t *)(in + 2 * 1); + _mm_prefetch(in + 2 * 3, _MM_HINT_NTA); + *(to + 2) = *(uint16_t *)(in + 2 * 2); + in += 2 * 3; + to += 3; + break; + } + case 0xf6: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 2 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint16_t *)(in + 2 * 0); + _mm_prefetch(in + 2 * 2, _MM_HINT_NTA); + *(to + 1) = *(uint16_t *)(in + 2 * 1); + in += 2 * 2; + to += 2; + break; + } + case 0xf7: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 2 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint16_t *)(in + 2 * 0); + in += 2 * 1; + to += 1; + break; + } + case 0xf8: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 3 * 1, _MM_HINT_NTA); + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + _mm_prefetch(in + 3 * 2, _MM_HINT_NTA); + *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); + _mm_prefetch(in + 3 * 3, _MM_HINT_NTA); + *(to + 2) = (*(uint8_t *)(in + 3 * 2) << 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2)); + _mm_prefetch(in + 3 * 4, _MM_HINT_NTA); + *(to + 3) = (*(uint8_t *)(in + 3 * 3) << 16) | (*(uint8_t *)(in + 3 * 3 + 1) << 8) | (*(uint8_t *)(in + 3 * 3 + 2)); + in += 3 * 4; + to += 4; + break; + } + case 0xf9: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 3 * 1, _MM_HINT_NTA); + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + _mm_prefetch(in + 3 * 2, _MM_HINT_NTA); + *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); + _mm_prefetch(in + 3 * 3, _MM_HINT_NTA); + *(to + 2) = (*(uint8_t *)(in + 3 * 2) 
<< 16) | (*(uint8_t *)(in + 3 * 2 + 1) << 8) | (*(uint8_t *)(in + 3 * 2 + 2)); + in += 3 * 3; + to += 3; + break; + } + case 0xfa: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 3 * 1, _MM_HINT_NTA); + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + _mm_prefetch(in + 3 * 2, _MM_HINT_NTA); + *(to + 1) = (*(uint8_t *)(in + 3 * 1) << 16) | (*(uint8_t *)(in + 3 * 1 + 1) << 8) | (*(uint8_t *)(in + 3 * 1 + 2)); + in += 3 * 2; + to += 2; + break; + } + case 0xfb: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 3 * 1, _MM_HINT_NTA); + *(to + 0) = (*(uint8_t *)(in + 3 * 0) << 16) | (*(uint8_t *)(in + 3 * 0 + 1) << 8) | (*(uint8_t *)(in + 3 * 0 + 2)); + in += 3 * 1; + to += 1; + break; + } + case 0xfc: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 4 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint32_t *)(in + 4 * 0); + _mm_prefetch(in + 4 * 2, _MM_HINT_NTA); + *(to + 1) = *(uint32_t *)(in + 4 * 1); + _mm_prefetch(in + 4 * 3, _MM_HINT_NTA); + *(to + 2) = *(uint32_t *)(in + 4 * 2); + _mm_prefetch(in + 4 * 4, _MM_HINT_NTA); + *(to + 3) = *(uint32_t *)(in + 4 * 3); + in += 4 * 4; + to += 4; + break; + } + case 0xfd: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 4 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint32_t *)(in + 4 * 0); + _mm_prefetch(in + 4 * 2, _MM_HINT_NTA); + *(to + 1) = *(uint32_t *)(in + 4 * 1); + _mm_prefetch(in + 4 * 3, _MM_HINT_NTA); + *(to + 2) = *(uint32_t *)(in + 4 * 2); + in += 4 * 3; + to += 3; + break; + } + case 0xfe: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 4 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint32_t *)(in + 4 * 0); + _mm_prefetch(in + 4 * 2, _MM_HINT_NTA); + *(to + 1) = *(uint32_t *)(in + 4 * 1); + in += 4 * 2; + to += 2; + break; + } + case 0xff: + _mm_prefetch(keys, _MM_HINT_NTA); + { + _mm_prefetch(in + 4 * 1, _MM_HINT_NTA); + *(to + 0) = *(uint32_t *)(in + 4 * 0); + in += 4 * 1; + to += 1; + break; + } + } + } +} diff 
--git a/ext/bench_/bench/compress_turbopackv.h b/ext/bench_/bench/compress_turbopackv.h new file mode 100644 index 0000000..1ca8d15 --- /dev/null +++ b/ext/bench_/bench/compress_turbopackv.h @@ -0,0 +1,33 @@ +/* + COMPRESS_TURBOPACKV.H + ---------------------- +*/ +#ifndef COMPRESS_TURBOPACKV_H_ +#define COMPRESS_TURBOPACKV_H_ + +#include +#include "compress_turbopackv_internals.h" + +/* + class ANT_COMPRESS_TURBOPACKV + ------------------------------ +*/ +class ANT_compress_turbopackv : public ANT_compress +{ +public: + ANT_compress_turbopackv() {} + virtual ~ANT_compress_turbopackv() {} + + virtual uint64_t compress(uint8_t *destination, uint64_t destination_length, uint32_t *source, uint64_t source_integers) + { + return turbopackv_compress(destination, destination_length, source, source_integers); + } + + virtual void decompress(uint32_t *destination, uint64_t destinaton_integers, uint8_t *source, uint64_t source_length) + { + turbopackv_decompress(destination, destinaton_integers, source, source_length); + } +} ; + +#endif + diff --git a/ext/bench_/bench/conf.h b/ext/bench_/bench/conf.h new file mode 100644 index 0000000..21d33dd --- /dev/null +++ b/ext/bench_/bench/conf.h @@ -0,0 +1,207 @@ +/** + Copyright (C) powturbo 2013-2016 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ + +// conf.h - config & common +#ifndef CONF_H +#define CONF_H +//------------------------- Compiler ------------------------------------------ + #if defined(__GNUC__) +#define ALIGNED(t,v,n) t v __attribute__ ((aligned (n))) +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define NOINLINE __attribute__((noinline)) +#define _PACKED __attribute__ ((packed)) +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#define popcnt32(_x_) __builtin_popcount(_x_) +#define popcnt64(_x_) __builtin_popcountll(_x_) + + #if defined(__i386__) || defined(__x86_64__) +static inline int __bsr32( int x) { asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); return x; } +static inline int bsr32( int x) { int b = -1; asm("bsrl %1,%0" : "+r" (b) : "rm" (x) ); return b + 1; } +static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } +#define bsr16(_x_) bsr32(_x_) + +static inline unsigned rol32(unsigned x, int s) { asm ("roll %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } +static inline unsigned ror32(unsigned x, int s) { asm ("rorl %%cl,%0" :"=r" (x) :"0" (x),"c" (s)); return x; } + + #else +static inline int bsr32(int x ) { return x?32 - __builtin_clz( x):0; } +static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } + +static inline unsigned rol32(unsigned x, int s) { return x << s | x >> (32 - s); } +static inline unsigned ror32(unsigned x, int s) { return x >> s | x << (32 - s); } + #endif + +#define ctz64(_x_) __builtin_ctzll(_x_) +#define ctz32(_x_) __builtin_ctz(_x_) +#define clz64(_x_) __builtin_clzll(_x_) +#define clz32(_x_) __builtin_clz(_x_) + +#if __GNUC_MINOR__ < 8 +static inline unsigned short bswap16(unsigned short a) { return (a<<8)|(a>>8); } +#else +#define bswap16(x) 
__builtin_bswap16(x) +#endif +#define bswap32(x) __builtin_bswap32(x) +#define bswap64(x) __builtin_bswap64(x) + + #elif _MSC_VER +#define ALIGNED(x) __declspec(align(x)) +#define ALWAYS_INLINE __forceinline +#define NOINLINE __declspec(noinline) +#define inline __inline +#define THREADLOCAL __declspec(thread) +#define likely(x) (x) +#define unlikely(x) (x) +#define __builtin_prefetch(x) //_mm_prefetch(x, _MM_HINT_NTA) + +static inline int bsr32(int x) { return x ? 32 - __builtin_clz(x) : 0; } + #ifdef _WIN64 +static inline int bsr64(unsigned long long x) { unsigned long z = 0; _BitScanForward64(&z, x); return 64 - z; } +static inline int clz64(unsigned long long x) { unsigned long z = 0; _BitScanForward64(&z, x); return z; } +static inline int ctz64(unsigned long long x) { unsigned long z = 0; _BitScanReverse64(&z, x); return z; } + #endif +static inline int clz32(unsigned x) { unsigned z = 0; _BitScanForward( &z, x); return 32 - z; } +static inline int ctz32(unsigned x) { unsigned z = 0; _BitScanReverse( &z, x); return z; } +#define rol32(x,s) _lrotl(x, s) +#define ror32(x,s) _lrotr(x, s) + +#define bswap16(x) _byteswap_ushort(x) +#define bswap32(x) _byteswap_ulong(x) +#define bswap64(x) _byteswap_uint64(x) + +#define fseeko _fseeki64 +#define ftello _ftelli64 +#define sleep(x) Sleep(x/1000) +#define strcasecmp _stricmp +#define strncasecmp _strnicmp + #endif + +#define ctz16(_x_) ctz32(_x_) +#define clz16(_x_) clz32(_x_) +//--------------- Unaligned memory access ------------------------------------- +/*# || defined(i386) || defined(_X86_) || defined(__THW_INTEL)*/ + #if defined(__i386__) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || /*MSC_VER*/\ + defined(__powerpc__) ||\ + defined(__ARM_FEATURE_UNALIGNED) || defined(__aarch64__) || defined(__arm__) ||\ + defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) || \ + defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) || \ + 
defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) +#define ctou16(_cp_) *(unsigned short *)(_cp_) +#define ctou32(_cp_) *(unsigned *)(_cp_) + + #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) +#define ctou64(_cp_) (*(unsigned long long *)(_cp_)) +#define ctou(_cp_t, _cp_) (*(_cp_t *)(_cp_)) + #endif + + #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7S__) +struct _PACKED shortu { unsigned short s; }; +struct _PACKED unsignedu { unsigned u; }; +struct _PACKED longu { unsigned long long l; }; + +#define ctou16(_cp_) ((struct shortu *)(_cp_))->s +#define ctou32(_cp_) ((struct unsignedu *)(_cp_))->u +#define ctou64(_cp_) ((struct longu *)(_cp_))->l + #else +#error "unknown cpu" + #endif + + #ifdef ctou16 +#define utoc16(_x_,_cp_) ctou16(_cp_) = _x_ + #else +static inline unsigned short ctou16(const void *cp) { unsigned short x; memcpy(&x, cp, sizeof(x)); return x; } +static inline void utoc16(unsigned short x, void *cp ) { memcpy(cp, &x, sizeof(x)); } + #endif + + #ifdef ctou32 +#define utoc32(_x_,_cp_) ctou32(_cp_) = _x_ + #else +static inline unsigned ctou32(const void *cp) { unsigned x; memcpy(&x, cp, sizeof(x)); return x; } +static inline void utoc32(unsigned x, void *cp ) { memcpy(cp, &x, sizeof(x)); } + #endif + + #ifdef ctou64 +#define utoc64(_x_,_cp_) ctou64(_cp_) = _x_ + #else +static inline unsigned long long ctou64(const void *cp) { unsigned long long x; memcpy(&x, cp, sizeof(x)); return x; } +static inline void utoc64(unsigned long long x, void *cp ) { memcpy(cp, &x, sizeof(x)); } + #endif + +#define ctou24(_cp_) (ctou32(_cp_) & 0xffffff) +#define ctou48(_cp_) (ctou64(_cp_) & 0xffffffffffffull) +#define ctou8(_cp_) (*_cp_) +//--------------------- wordsize ---------------------------------------------- + #if defined(__64BIT__) || 
defined(_LP64) || defined(__LP64__) || defined(_WIN64) ||\ + defined(__x86_64__) || defined(_M_X64) ||\ + defined(__ia64) || defined(_M_IA64) ||\ + defined(__aarch64__) ||\ + defined(__mips64) ||\ + defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) ||\ + defined(__s390x__) +#define __WORDSIZE 64 + #else +#define __WORDSIZE 32 + #endif +#endif + +//---------------------misc --------------------------------------------------- +#define SIZE_ROUNDUP(_n_, _a_) (((size_t)(_n_) + (size_t)((_a_) - 1)) & ~(size_t)((_a_) - 1)) + #ifndef min +#define min(x,y) (((x)<(y)) ? (x) : (y)) +#define max(x,y) (((x)>(y)) ? (x) : (y)) + #endif + +#define TEMPLATE2_(_x_, _y_) _x_##_y_ +#define TEMPLATE2(_x_, _y_) TEMPLATE2_(_x_,_y_) + +#define TEMPLATE3_(_x_,_y_,_z_) _x_##_y_##_z_ +#define TEMPLATE3(_x_,_y_,_z_) TEMPLATE3_(_x_, _y_, _z_) + +//--- NDEBUG ------- +#include + #ifdef _MSC_VER + #ifdef NDEBUG +#define AS(expr, fmt, ...) +#define AC(expr, fmt, ...) if(!(expr)) { fprintf(stderr, fmt, __VA_ARGS__ ); fflush(stderr); abort(); } +#define die(fmt, ...) do { fprintf(stderr, fmt, __VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) + #else +#define AS(expr, fmt, ...) if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ,__VA_ARGS__ ); fflush(stderr); abort(); } +#define AC(expr, fmt, ...) if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ,__VA_ARGS__ ); fflush(stderr); abort(); } +#define die(fmt, ...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ,__VA_ARGS__ ); fflush(stderr); exit(-1); } while(0) + #endif + #else + #ifdef NDEBUG +#define AS(expr, fmt,args...) +#define AC(expr, fmt,args...) if(!(expr)) { fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } +#define die(fmt,args...) do { fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) + #else +#define AS(expr, fmt,args...) 
if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } +#define AC(expr, fmt,args...) if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } +#define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) + #endif + #endif + diff --git a/ext/bench_/bench/util.h b/ext/bench_/bench/util.h new file mode 100644 index 0000000..b77540c --- /dev/null +++ b/ext/bench_/bench/util.h @@ -0,0 +1,407 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + * and Owen Kaser + */ + +#ifndef UTIL +#define UTIL +#include "common.h" + +#ifdef __linux__ +#define USE_O_DIRECT +#endif + +namespace FastPForLib { + +//#define STATS +// taken from stackoverflow +#ifndef NDEBUG +#define ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + std::cerr << "Assertion `" #condition "` failed in " << __FILE__ \ + << " line " << __LINE__ << ": " << message << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (false) +#else +#define ASSERT(condition, message) \ + do { \ + } while (false) +#endif + +/** + * Computes the greatest common divisor + */ +constexpr __attribute__((const)) uint32_t gcd(uint32_t x, uint32_t y) { + return (x % y) == 0 ? 
y : gcd(y, x % y);
+}
+
+template <class T> __attribute__((const)) T *padTo32bits(T *inbyte) { // round the address up to a multiple of 4 bytes
+  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 3) & ~3);
+}
+
+template <class T>
+__attribute__((const)) const T *padTo32bits(const T *inbyte) {
+  return reinterpret_cast<const T *>((reinterpret_cast<uintptr_t>(inbyte) + 3) &
+                                     ~3);
+}
+
+template <class T> __attribute__((const)) T *padTo64bits(T *inbyte) { // round the address up to a multiple of 8 bytes
+  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 7) & ~7);
+}
+
+template <class T>
+__attribute__((const)) const T *padTo64bits(const T *inbyte) {
+  return reinterpret_cast<const T *>((reinterpret_cast<uintptr_t>(inbyte) + 7) &
+                                     ~7);
+}
+
+template <class T> __attribute__((const)) T *padTo128bits(T *inbyte) { // round the address up to a multiple of 16 bytes
+  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 15) &
+                               ~15);
+}
+
+template <class T>
+__attribute__((const)) const T *padTo128bits(const T *inbyte) {
+  return reinterpret_cast<const T *>(
+      (reinterpret_cast<uintptr_t>(inbyte) + 15) & ~15);
+}
+
+template <class T> __attribute__((const)) T *padTo64bytes(T *inbyte) { // round the address up to a multiple of 64 bytes
+  return reinterpret_cast<T *>((reinterpret_cast<uintptr_t>(inbyte) + 63) &
+                               ~63);
+}
+
+template <class T>
+__attribute__((const)) const T *padTo64bytes(const T *inbyte) {
+  return reinterpret_cast<const T *>((reinterpret_cast<uintptr_t>(inbyte) + 63) &
+                                     ~63);
+}
+
+template <class T>
+__attribute__((const)) bool needPaddingTo32Bits(const T *inbyte) { // true when the address is not a multiple of 4
+  return (reinterpret_cast<uintptr_t>(inbyte) & 3) != 0;
+}
+
+template <class T>
+__attribute__((const)) bool needPaddingTo64Bits(const T *inbyte) {
+  return (reinterpret_cast<uintptr_t>(inbyte) & 7) != 0;
+}
+
+template <class T>
+__attribute__((const)) bool needPaddingTo128Bits(const T *inbyte) {
+  return (reinterpret_cast<uintptr_t>(inbyte) & 15) != 0;
+}
+
+template <class T> bool needPaddingTo64bytes(const T *inbyte) {
+  return (reinterpret_cast<uintptr_t>(inbyte) & 63) != 0;
+}
+
+__attribute__((const)) inline uint32_t gccbits(const uint32_t v) { // index of the highest set bit plus one; 0 for v == 0
+#ifdef _MSC_VER
+  if (v == 0) {
+    return 0;
+  }
+  unsigned long answer;
+  _BitScanReverse(&answer, v);
+  return answer + 1;
+#else
+  return v == 0 ?
0 : 32 - __builtin_clz(v);
+#endif
+}
+
+#ifdef _MSC_VER
+// taken from
+// http://stackoverflow.com/questions/355967/how-to-use-msvc-intrinsics-to-get-the-equivalent-of-this-gcc-code
+uint32_t __builtin_clz(uint32_t x) {
+  unsigned long r = 0;
+  _BitScanReverse(&r, x);
+  return (31 - r);
+}
+
+#endif
+
+__attribute__((const)) inline bool divisibleby(size_t a, uint32_t x) {
+  return (a % x == 0);
+}
+
+/**
+ * compute the deltas, you do not want to use this
+ * function if speed matters. This is only for convenience.
+ */
+template <class container>
+container diffs(const container &in, const bool aredistinct) {
+  container out;
+  if (in.empty())
+    return out;
+  out.resize(in.size() - 1); // one delta per adjacent pair; filled by index below (push_back after resize doubled the length)
+  for (size_t k = 0; k < in.size() - 1; ++k)
+    if (aredistinct)
+      out[k] = in[k + 1] - in[k] - 1;
+    else
+      out[k] = in[k + 1] - in[k];
+  return out;
+}
+
+inline void checkifdivisibleby(size_t a, uint32_t x) {
+  if (!divisibleby(a, x)) {
+    std::ostringstream convert;
+    convert << a << " not divisible by " << x;
+    throw std::logic_error(convert.str());
+  }
+}
+
+template <class iter> void printme(iter i, iter b) {
+  for (iter j = i; j != b; ++j)
+    std::cout << *j << " ";
+  std::cout << std::endl;
+}
+
+__attribute__((const)) inline uint32_t asmbits(const uint32_t v) {
+#ifdef _MSC_VER
+  return gccbits(v);
+#else
+  if (v == 0)
+    return 0;
+  uint32_t answer;
+  __asm__("bsr %1, %0;" : "=r"(answer) : "r"(v));
+  return answer + 1;
+#endif
+}
+
+__attribute__((const)) inline uint32_t slowbits(uint32_t v) {
+  uint32_t r = 0;
+  while (v) {
+    r++;
+    v = v >> 1;
+  }
+  return r;
+}
+
+__attribute__((const)) inline uint32_t bits(uint32_t v) {
+  uint32_t r(0);
+  if (v >= (1U << 15)) {
+    v >>= 16;
+    r += 16;
+  }
+  if (v >= (1U << 7)) {
+    v >>= 8;
+    r += 8;
+  }
+  if (v >= (1U << 3)) {
+    v >>= 4;
+    r += 4;
+  }
+  if (v >= (1U << 1)) {
+    v >>= 2;
+    r += 2;
+  }
+  if (v >= (1U << 0)) {
+    v >>= 1;
+    r += 1;
+  }
+  return r;
+}
+
+#ifndef _MSC_VER
+__attribute__((const)) constexpr uint32_t constexprbits(uint32_t v) {
+
return v >= (1U << 15)
+             ? 16 + constexprbits(v >> 16)
+             : (v >= (1U << 7))
+                   ? 8 + constexprbits(v >> 8)
+                   : (v >= (1U << 3))
+                         ? 4 + constexprbits(v >> 4)
+                         : (v >= (1U << 1))
+                               ? 2 + constexprbits(v >> 2)
+                               : (v >= (1U << 0)) ? 1 + constexprbits(v >> 1)
+                                                  : 0;
+}
+#else
+
+template <int N> struct exprbits {
+  enum { value = 1 + exprbits<(N >> 1)>::value };
+};
+
+template <> struct exprbits<0> {
+  enum { value = 0 };
+};
+
+#define constexprbits(n) exprbits<n>::value
+
+#endif
+
+constexpr uint32_t div_roundup(uint32_t v, uint32_t divisor) {
+  return (v + (divisor - 1)) / divisor;
+}
+
+template <class iterator>
+__attribute__((pure)) uint32_t maxbits(const iterator &begin,
+                                       const iterator &end) { // gccbits of the bitwise OR of every value in [begin, end)
+  uint32_t accumulator = 0;
+  for (iterator k = begin; k != end; ++k) {
+    accumulator |= *k;
+  }
+  return gccbits(accumulator);
+}
+
+template <class iterator>
+uint32_t slowmaxbits(const iterator &begin, const iterator &end) { // largest gccbits over [begin, end), computed per element
+  uint32_t accumulator = 0;
+  for (iterator k = begin; k != end; ++k) {
+    const uint32_t tb = gccbits(*k);
+    if (tb > accumulator)
+      accumulator = tb;
+  }
+  return accumulator;
+}
+
+// basically, we can sometimes memoize the maxbits computation
+// Since the first scan looks at b input words, the second looks
+// at b/2, the third looks at b/3... (total related to harmonic numbers)
+// it is probably only worthwhile to memoize the first maybe 20% prefix
+// (rest can be "naively" re-scanned if needed)
+// also, a useful heuristic should be to start with however many
+// bits are required for the first number in the sequence. Or OR
+// the first two or three values together (danger, what if you OR
+// more than you'd actually use?)
+// alternative heuristic is to start with however many bits you used for the
+// last encoding. See if it works. Yes: start sequential scan downward. No:
+// start sequential scan upward.
+// To be tried...
+ +// template +// struct bitwise_or : public binary_function { +// t operator()(t x, t y) { return x|y; } +//}; + +// +template +int greedy_bit_size_lookahead(const iterator &begin, const iterator &end) { + // assert(end- begin <= b); + std::vector prefixOrBuffer(end - + begin); // consider a preallocated buffer... + + partial_sum(begin, end, prefixOrBuffer.begin(), + [](t x, t y) { return x | y; } // change dl's + to | + // bitwise_or() + ); + // do the bitwise or-ing once only. + if (end - begin == + b) { // expected case, to help out compiler. Should be unrolled + for (int i = 1; i < 31; ++i) + if (prefixOrBuffer[b / i - 1] < (static_cast(1) << i)) + return i; + // assert(false); // cannot get here unless 32+ bits required + return -1; + } else { // general case, maybe less data than we could pack with 1-bit fields + for (int i = 1; i < 31; ++i) { + uint64_t indexToCheck = b / i - 1; + if (indexToCheck >= prefixOrBuffer.size()) + indexToCheck = prefixOrBuffer.size() - 1; + + if (prefixOrBuffer[indexToCheck] < (static_cast(1) << i)) + return i; + } + // assert(false); + return -1; + } +} + +// assume the previous bit size is close to the required bit size +template +int greedy_bit_size_lookahead(const iterator &begin, const iterator &end, + uint32_t previous_size) { + + uint32_t span_length = end - begin; + if (span_length == b) { // work on the specialization later... 
+ // try previous size + if (maxbits(begin, begin + (b / previous_size)) > previous_size) { + // previous_size is too small; go until you find something bigger that + // works + for (uint32_t i = previous_size + 1; + i < previous_size + 32 /* was nothing */; + ++i) // upper bound is only to encourage compiler to unroll + if (maxbits(begin, begin + (b / i)) <= i) + return i; + return -1; // impossible + } else { // previous_size works, but perhaps we can find something smaller + // that also works + uint32_t i; + for (i = previous_size - 1; i /* > 0 */ != previous_size - 32; --i) { + if (i == 0) + break; // This funkiness is to encourage unrolling. + if (maxbits(begin, begin + (b / i)) > i) + break; + } + return i + 1; // either i=0 and we return 1....or i is the first too-small + // size + } + } else { + // same thing with careful checks to avoid reading past end of buffer + uint32_t endIdx = b / previous_size; + if (endIdx >= span_length) + endIdx = span_length; + + if (maxbits(begin, begin + endIdx) > previous_size) { + for (uint32_t i = previous_size + 1;; ++i) { + endIdx = b / i; + if (endIdx >= span_length) + endIdx = span_length; + if (maxbits(begin, begin + endIdx) <= i) + return i; + } + return -1; // impossible + } else { + uint32_t i; + for (i = previous_size - 1; i > 0; --i) { + endIdx = b / i; + if (endIdx >= span_length) + endIdx = span_length; + if (maxbits(begin, begin + endIdx) > i) + break; + } + return i + 1; + } + } +} + +class BitWidthHistoGram { +public: + std::vector histo; + BitWidthHistoGram() : histo(33, 0) {} + + void display(std::string prefix = "") { + double sum = 0; + for (size_t k = 0; k < histo.size(); ++k) + sum += histo[k]; + if (sum == 0) + return; + for (size_t k = 0; k < histo.size(); ++k) { + std::cout << prefix << k << " " << histo[k] / sum << std::endl; + } + } + template void eatIntegers(const container &rawdata) { + for (uint32_t i = 0; i < rawdata.size(); ++i) { + histo[asmbits(rawdata[i])] += 1; + } + } + + template void 
eatDGaps(const container &rawdata) { + if (rawdata.size() <= 1) + return; + for (uint32_t i = 0; i < rawdata.size() - 1; ++i) { + assert(rawdata[i + 1] > rawdata[i]); + uint32_t gap = rawdata[i + 1] - rawdata[i] - 1; + assert(gap < rawdata[i + 1]); + histo[asmbits(gap)] += 1; + } + } +}; + +} // namespace FastPFor + +#endif diff --git a/ext/bitshuffle b/ext/bitshuffle new file mode 160000 index 0000000..57ec156 --- /dev/null +++ b/ext/bitshuffle @@ -0,0 +1 @@ +Subproject commit 57ec1563869de120f70f474e639ce0c4dfc6774a diff --git a/ext/c-blosc2 b/ext/c-blosc2 new file mode 160000 index 0000000..1415512 --- /dev/null +++ b/ext/c-blosc2 @@ -0,0 +1 @@ +Subproject commit 14155126a20ab343f28e1596fa94e2942aa34beb diff --git a/ext/ext.c b/ext/ext.c deleted file mode 100644 index 1d2e3c5..0000000 --- a/ext/ext.c +++ /dev/null @@ -1,120 +0,0 @@ -//-------------------------------------- External functions for comparison ------------------------------------------------------------------------ - -// simple-8b simple16 and optpfd don't work with all interger lists. Enable if you to want to test -//#define _SIMPLE_8B // crashs on some lists -//#define _SIMPLE16 // limited to 28 bits -//#define _OPTPFD // compression too slow and limited to 28 bits. crashs on some lists -//#define _VBYTEPOLY // limited to 28 bits. - #ifdef __SSSE3__ -#define _VARINTG8IU -#define _MASKEDVBYTE // http://maskedvbyte.org - #endif - -//- Optional external libraries. 
Activate also in makefile ----- -//#define _LIBFOR // libfor -#define _QMX - -//#define _BTSHUF // https://github.com/kiyo-masui/bitshuffle - -#define _LZ4 -//#define _BLOSC // https://github.com/Blosc/c-blosc -//#define _ZLIB -//#define _LZT // LzTurbo not inluded -//------------------------------------------------------------------------------- -#include "vabyte.h" // Standard Variable Byte - -#include "simdcomp/include/simdbitpacking.h" // SIMD FastPFor -#include "simdcomp/include/simdcomputil.h" -#include "simdcomp/include/simdintegratedbitpacking.h" - - #ifdef _VARINTG8IU -#include "varintg8iu.h" // SIMD Varint G8IU -#include "varintg8iu.h" - #endif - -#include "vas16c.h" // Simple 16 -#include "vas16d.h" - -#include "OPT_PFD/opt_p4.h" // OptPFD -#include "simple8b.h" // optimized simple-8b - - #ifdef _MASKEDVBYTE -#include "MaskedVByte/include/varintencode.h" -#include "MaskedVByte/include/varintdecode.h" - #endif - - #ifdef _LIBFOR -#include "for/for.h" - #endif - - #ifdef _QMX -#include "qmx/compress_qmx.h" - #endif - - #ifdef _ZLIB -#include - #endif - - #ifdef _LZ4 -#include "lz4.h" - #endif - - #ifdef _BLOSC -#include "c-blosc/blosc/shuffle.h" -#include "c-blosc/blosc/blosc.h" - #endif - - #ifdef _BTSHUF -#include "bitshuffle/src/bitshuffle.h" - #endif - - #ifdef _LZT -#include "../../lz/lz8.h" -int lz8c0( struct lzobj *lz); -int lz8c01(struct lzobj *lz); -int lz8d( struct lzobj *lz); - -#include "../../lz/lzb.h" -int lzbc0( struct lzobj *lz); -int lzbc01(struct lzobj *lz); -int lzbc2( struct lzobj *lz); -int lzbd( struct lzobj *lz); - #endif -//---------------- FastPFor functions --------------------- -unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456; - uint32_t *in_; - for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b); - return (unsigned char *)out; -} - 
-unsigned char *simdpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456; - uint32_t *in_; - for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpack(in, (__m128i *)out, b); - return (unsigned char *)out; -} - -unsigned char *simdunpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) { - uint32_t k, *out_; - for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack((const __m128i *)in, out, b); - return (unsigned char *)in; -} - -unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456; - uint32_t *in_; - for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b); //simdpackwithoutmaskd1(x, ip+1, (__m128i *)out, b); - return (unsigned char *)out; -} - -unsigned char *simdunpackn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) { - uint32_t k, *out_; - for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpackd1(start, (__m128i *)in, out, b); - return (unsigned char *)in; -} -//--------------- Polytec variable byte ----------- -#include "vbyte_poly.h" -unsigned char *vbpolyenc(unsigned *in, unsigned n, unsigned char *out) { - unsigned i; for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); } return out; -} -unsigned char *vbpolydec(unsigned char *in, unsigned n, unsigned *out) { - unsigned i; for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; } return in; -} diff --git a/ext/for/LICENSE b/ext/for/LICENSE deleted file mode 100644 index d1752ef..0000000 --- a/ext/for/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 
DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. 
For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of 
the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/ext/for/for-gen.c b/ext/for/for-gen.c deleted file mode 100644 index 7f63200..0000000 --- a/ext/for/for-gen.c +++ /dev/null @@ -1,28187 +0,0 @@ -/* This file was generated. - * - * The pack/unpack routines will not work on big-endian architectures. 
- */ - -static uint32_t -pack0_n(uint32_t base, const uint32_t *in, uint8_t *out) { - (void)base; - (void)in; - (void)out; - return 0; -} - -static uint32_t -unpack0_n(uint32_t base, const uint8_t *in, uint32_t *out) { - int k; - (void)in; - for (k = 0; k < 32; ++k) { - out[k] = base; - } - return 0; -} - -static uint32_t -pack0_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - (void)base; - (void)in; - (void)out; - (void)length; - return 0; -} - -static uint32_t -unpack0_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t k; - (void)in; - for (k = 0; k < length; ++k) { - out[k] = base; - } - return 0; -} - -static uint32_t -linsearch0_n(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - (void)in; - if (base == value) - *found = 0; - return 0; -} - -static uint32_t -linsearch0_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, - int *found) { - (void)in; - if (base == value && length > 0) - *found = 0; - return 0; -} - -static uint32_t -pack1_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 1; - tmp |= (*(in + 2) - base) << 2; - tmp |= (*(in + 3) - base) << 3; - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 5; - tmp |= (*(in + 6) - base) << 6; - tmp |= (*(in + 7) - base) << 7; - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 9; - tmp |= (*(in + 10) - base) << 10; - tmp |= (*(in + 11) - base) << 11; - tmp |= (*(in + 12) - base) << 12; - tmp |= (*(in + 13) - base) << 13; - tmp |= (*(in + 14) - base) << 14; - tmp |= (*(in + 15) - base) << 15; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 17; - tmp |= (*(in + 18) - base) << 18; - tmp |= (*(in + 19) - base) << 19; - tmp |= (*(in + 20) - base) << 20; - tmp |= (*(in + 21) - base) << 21; - tmp |= (*(in + 22) - base) << 22; - tmp |= (*(in + 23) - base) << 23; - tmp |= (*(in + 24) - base) << 24; - 
tmp |= (*(in + 25) - base) << 25; - tmp |= (*(in + 26) - base) << 26; - tmp |= (*(in + 27) - base) << 27; - tmp |= (*(in + 28) - base) << 28; - tmp |= (*(in + 29) - base) << 29; - tmp |= (*(in + 30) - base) << 30; - tmp |= (*(in + 31) - base) << 31; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 4) */ - memcpy(out, &tmp, length); - return 4; -} - -static uint32_t -unpack1_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1); - *(out + 1) = base + ((*in32 >> 1) & 1); - *(out + 2) = base + ((*in32 >> 2) & 1); - *(out + 3) = base + ((*in32 >> 3) & 1); - *(out + 4) = base + ((*in32 >> 4) & 1); - *(out + 5) = base + ((*in32 >> 5) & 1); - *(out + 6) = base + ((*in32 >> 6) & 1); - *(out + 7) = base + ((*in32 >> 7) & 1); - *(out + 8) = base + ((*in32 >> 8) & 1); - *(out + 9) = base + ((*in32 >> 9) & 1); - *(out + 10) = base + ((*in32 >> 10) & 1); - *(out + 11) = base + ((*in32 >> 11) & 1); - *(out + 12) = base + ((*in32 >> 12) & 1); - *(out + 13) = base + ((*in32 >> 13) & 1); - *(out + 14) = base + ((*in32 >> 14) & 1); - *(out + 15) = base + ((*in32 >> 15) & 1); - *(out + 16) = base + ((*in32 >> 16) & 1); - *(out + 17) = base + ((*in32 >> 17) & 1); - *(out + 18) = base + ((*in32 >> 18) & 1); - *(out + 19) = base + ((*in32 >> 19) & 1); - *(out + 20) = base + ((*in32 >> 20) & 1); - *(out + 21) = base + ((*in32 >> 21) & 1); - *(out + 22) = base + ((*in32 >> 22) & 1); - *(out + 23) = base + ((*in32 >> 23) & 1); - *(out + 24) = base + ((*in32 >> 24) & 1); - *(out + 25) = base + ((*in32 >> 25) & 1); - *(out + 26) = base + ((*in32 >> 26) & 1); - *(out + 27) = base + ((*in32 >> 27) & 1); - *(out + 28) = base + ((*in32 >> 28) & 1); - *(out + 29) = base + ((*in32 >> 29) & 1); - *(out + 30) = base + ((*in32 >> 30) & 1); - *(out + 31) = base + ((*in32 >> 31) & 1); - /* remaining: 0 bits */ - return 4; -} - -static uint32_t 
-pack2_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 2; - tmp |= (*(in + 2) - base) << 4; - tmp |= (*(in + 3) - base) << 6; - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 10; - tmp |= (*(in + 6) - base) << 12; - tmp |= (*(in + 7) - base) << 14; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 18; - tmp |= (*(in + 10) - base) << 20; - tmp |= (*(in + 11) - base) << 22; - tmp |= (*(in + 12) - base) << 24; - tmp |= (*(in + 13) - base) << 26; - tmp |= (*(in + 14) - base) << 28; - tmp |= (*(in + 15) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 2; - tmp |= (*(in + 18) - base) << 4; - tmp |= (*(in + 19) - base) << 6; - tmp |= (*(in + 20) - base) << 8; - tmp |= (*(in + 21) - base) << 10; - tmp |= (*(in + 22) - base) << 12; - tmp |= (*(in + 23) - base) << 14; - tmp |= (*(in + 24) - base) << 16; - tmp |= (*(in + 25) - base) << 18; - tmp |= (*(in + 26) - base) << 20; - tmp |= (*(in + 27) - base) << 22; - tmp |= (*(in + 28) - base) << 24; - tmp |= (*(in + 29) - base) << 26; - tmp |= (*(in + 30) - base) << 28; - tmp |= (*(in + 31) - base) << 30; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 8) */ - memcpy(out, &tmp, length); - return 8; -} - -static uint32_t -unpack2_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 3); - *(out + 1) = base + ((*in32 >> 2) & 3); - *(out + 2) = base + ((*in32 >> 4) & 3); - *(out + 3) = base + ((*in32 >> 6) & 3); - *(out + 4) = base + ((*in32 >> 8) & 3); - *(out + 5) = base + ((*in32 >> 10) & 3); - *(out + 6) = base + ((*in32 >> 12) & 3); - *(out + 7) = base + ((*in32 >> 14) & 3); - *(out + 8) = base + ((*in32 >> 16) & 3); - *(out + 9) = base + 
((*in32 >> 18) & 3); - *(out + 10) = base + ((*in32 >> 20) & 3); - *(out + 11) = base + ((*in32 >> 22) & 3); - *(out + 12) = base + ((*in32 >> 24) & 3); - *(out + 13) = base + ((*in32 >> 26) & 3); - *(out + 14) = base + ((*in32 >> 28) & 3); - *(out + 15) = base + ((*in32 >> 30) & 3); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 16) = base + ((*in32 >> 0) & 3); - *(out + 17) = base + ((*in32 >> 2) & 3); - *(out + 18) = base + ((*in32 >> 4) & 3); - *(out + 19) = base + ((*in32 >> 6) & 3); - *(out + 20) = base + ((*in32 >> 8) & 3); - *(out + 21) = base + ((*in32 >> 10) & 3); - *(out + 22) = base + ((*in32 >> 12) & 3); - *(out + 23) = base + ((*in32 >> 14) & 3); - *(out + 24) = base + ((*in32 >> 16) & 3); - *(out + 25) = base + ((*in32 >> 18) & 3); - *(out + 26) = base + ((*in32 >> 20) & 3); - *(out + 27) = base + ((*in32 >> 22) & 3); - *(out + 28) = base + ((*in32 >> 24) & 3); - *(out + 29) = base + ((*in32 >> 26) & 3); - *(out + 30) = base + ((*in32 >> 28) & 3); - *(out + 31) = base + ((*in32 >> 30) & 3); - /* remaining: 0 bits */ - return 8; -} - -static uint32_t -pack3_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 3; - tmp |= (*(in + 2) - base) << 6; - tmp |= (*(in + 3) - base) << 9; - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 15; - tmp |= (*(in + 6) - base) << 18; - tmp |= (*(in + 7) - base) << 21; - tmp |= (*(in + 8) - base) << 24; - tmp |= (*(in + 9) - base) << 27; - tmp |= (*(in + 10) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 10) - base) >> (3 - 1); - tmp |= (*(in + 11) - base) << 1; - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in + 13) - base) << 7; - tmp |= (*(in + 14) - base) << 10; - tmp |= (*(in + 15) - base) << 13; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 19; - tmp |= (*(in + 18) - base) << 22; - tmp |= (*(in + 19) - 
base) << 25; - tmp |= (*(in + 20) - base) << 28; - tmp |= (*(in + 21) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 21) - base) >> (3 - 2); - tmp |= (*(in + 22) - base) << 2; - tmp |= (*(in + 23) - base) << 5; - tmp |= (*(in + 24) - base) << 8; - tmp |= (*(in + 25) - base) << 11; - tmp |= (*(in + 26) - base) << 14; - tmp |= (*(in + 27) - base) << 17; - tmp |= (*(in + 28) - base) << 20; - tmp |= (*(in + 29) - base) << 23; - tmp |= (*(in + 30) - base) << 26; - tmp |= (*(in + 31) - base) << 29; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 12) */ - memcpy(out, &tmp, length); - return 12; -} - -static uint32_t -unpack3_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 7); - *(out + 1) = base + ((*in32 >> 3) & 7); - *(out + 2) = base + ((*in32 >> 6) & 7); - *(out + 3) = base + ((*in32 >> 9) & 7); - *(out + 4) = base + ((*in32 >> 12) & 7); - *(out + 5) = base + ((*in32 >> 15) & 7); - *(out + 6) = base + ((*in32 >> 18) & 7); - *(out + 7) = base + ((*in32 >> 21) & 7); - *(out + 8) = base + ((*in32 >> 24) & 7); - *(out + 9) = base + ((*in32 >> 27) & 7); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 1)) << (3 - 1); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 1) & 7); - *(out + 12) = base + ((*in32 >> 4) & 7); - *(out + 13) = base + ((*in32 >> 7) & 7); - *(out + 14) = base + ((*in32 >> 10) & 7); - *(out + 15) = base + ((*in32 >> 13) & 7); - *(out + 16) = base + ((*in32 >> 16) & 7); - *(out + 17) = base + ((*in32 >> 19) & 7); - *(out + 18) = base + ((*in32 >> 22) & 7); - *(out + 19) = base + ((*in32 >> 25) & 7); - *(out + 20) = base + ((*in32 >> 28) & 7); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (3 - 2); - *(out + 21) = base + 
tmp; - *(out + 22) = base + ((*in32 >> 2) & 7); - *(out + 23) = base + ((*in32 >> 5) & 7); - *(out + 24) = base + ((*in32 >> 8) & 7); - *(out + 25) = base + ((*in32 >> 11) & 7); - *(out + 26) = base + ((*in32 >> 14) & 7); - *(out + 27) = base + ((*in32 >> 17) & 7); - *(out + 28) = base + ((*in32 >> 20) & 7); - *(out + 29) = base + ((*in32 >> 23) & 7); - *(out + 30) = base + ((*in32 >> 26) & 7); - *(out + 31) = base + ((*in32 >> 29) & 7); - /* remaining: 0 bits */ - return 12; -} - -static uint32_t -pack4_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 4; - tmp |= (*(in + 2) - base) << 8; - tmp |= (*(in + 3) - base) << 12; - tmp |= (*(in + 4) - base) << 16; - tmp |= (*(in + 5) - base) << 20; - tmp |= (*(in + 6) - base) << 24; - tmp |= (*(in + 7) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 4; - tmp |= (*(in + 10) - base) << 8; - tmp |= (*(in + 11) - base) << 12; - tmp |= (*(in + 12) - base) << 16; - tmp |= (*(in + 13) - base) << 20; - tmp |= (*(in + 14) - base) << 24; - tmp |= (*(in + 15) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 4; - tmp |= (*(in + 18) - base) << 8; - tmp |= (*(in + 19) - base) << 12; - tmp |= (*(in + 20) - base) << 16; - tmp |= (*(in + 21) - base) << 20; - tmp |= (*(in + 22) - base) << 24; - tmp |= (*(in + 23) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 24) - base) << 0; - tmp |= (*(in + 25) - base) << 4; - tmp |= (*(in + 26) - base) << 8; - tmp |= (*(in + 27) - base) << 12; - tmp |= (*(in + 28) - base) << 16; - tmp |= (*(in + 29) - base) << 20; - tmp |= (*(in + 30) - base) << 24; - tmp |= (*(in + 31) - base) << 28; - /* 
remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 16) */ - memcpy(out, &tmp, length); - return 16; -} - -static uint32_t -unpack4_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 15); - *(out + 1) = base + ((*in32 >> 4) & 15); - *(out + 2) = base + ((*in32 >> 8) & 15); - *(out + 3) = base + ((*in32 >> 12) & 15); - *(out + 4) = base + ((*in32 >> 16) & 15); - *(out + 5) = base + ((*in32 >> 20) & 15); - *(out + 6) = base + ((*in32 >> 24) & 15); - *(out + 7) = base + ((*in32 >> 28) & 15); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 8) = base + ((*in32 >> 0) & 15); - *(out + 9) = base + ((*in32 >> 4) & 15); - *(out + 10) = base + ((*in32 >> 8) & 15); - *(out + 11) = base + ((*in32 >> 12) & 15); - *(out + 12) = base + ((*in32 >> 16) & 15); - *(out + 13) = base + ((*in32 >> 20) & 15); - *(out + 14) = base + ((*in32 >> 24) & 15); - *(out + 15) = base + ((*in32 >> 28) & 15); - in32++; - /* consumed: 4 bytes (total: 12) */ - *(out + 16) = base + ((*in32 >> 0) & 15); - *(out + 17) = base + ((*in32 >> 4) & 15); - *(out + 18) = base + ((*in32 >> 8) & 15); - *(out + 19) = base + ((*in32 >> 12) & 15); - *(out + 20) = base + ((*in32 >> 16) & 15); - *(out + 21) = base + ((*in32 >> 20) & 15); - *(out + 22) = base + ((*in32 >> 24) & 15); - *(out + 23) = base + ((*in32 >> 28) & 15); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 24) = base + ((*in32 >> 0) & 15); - *(out + 25) = base + ((*in32 >> 4) & 15); - *(out + 26) = base + ((*in32 >> 8) & 15); - *(out + 27) = base + ((*in32 >> 12) & 15); - *(out + 28) = base + ((*in32 >> 16) & 15); - *(out + 29) = base + ((*in32 >> 20) & 15); - *(out + 30) = base + ((*in32 >> 24) & 15); - *(out + 31) = base + ((*in32 >> 28) & 15); - /* remaining: 0 bits */ - return 16; -} - -static uint32_t -pack5_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = 
(*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 5; - tmp |= (*(in + 2) - base) << 10; - tmp |= (*(in + 3) - base) << 15; - tmp |= (*(in + 4) - base) << 20; - tmp |= (*(in + 5) - base) << 25; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 6) - base) >> (5 - 3); - tmp |= (*(in + 7) - base) << 3; - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 13; - tmp |= (*(in + 10) - base) << 18; - tmp |= (*(in + 11) - base) << 23; - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 12) - base) >> (5 - 1); - tmp |= (*(in + 13) - base) << 1; - tmp |= (*(in + 14) - base) << 6; - tmp |= (*(in + 15) - base) << 11; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 21; - tmp |= (*(in + 18) - base) << 26; - tmp |= (*(in + 19) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 19) - base) >> (5 - 4); - tmp |= (*(in + 20) - base) << 4; - tmp |= (*(in + 21) - base) << 9; - tmp |= (*(in + 22) - base) << 14; - tmp |= (*(in + 23) - base) << 19; - tmp |= (*(in + 24) - base) << 24; - tmp |= (*(in + 25) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 25) - base) >> (5 - 2); - tmp |= (*(in + 26) - base) << 2; - tmp |= (*(in + 27) - base) << 7; - tmp |= (*(in + 28) - base) << 12; - tmp |= (*(in + 29) - base) << 17; - tmp |= (*(in + 30) - base) << 22; - tmp |= (*(in + 31) - base) << 27; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 20) */ - memcpy(out, &tmp, length); - return 20; -} - -static uint32_t -unpack5_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 31); - *(out + 1) = base + 
((*in32 >> 5) & 31); - *(out + 2) = base + ((*in32 >> 10) & 31); - *(out + 3) = base + ((*in32 >> 15) & 31); - *(out + 4) = base + ((*in32 >> 20) & 31); - *(out + 5) = base + ((*in32 >> 25) & 31); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 3)) << (5 - 3); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 3) & 31); - *(out + 8) = base + ((*in32 >> 8) & 31); - *(out + 9) = base + ((*in32 >> 13) & 31); - *(out + 10) = base + ((*in32 >> 18) & 31); - *(out + 11) = base + ((*in32 >> 23) & 31); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 1)) << (5 - 1); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 1) & 31); - *(out + 14) = base + ((*in32 >> 6) & 31); - *(out + 15) = base + ((*in32 >> 11) & 31); - *(out + 16) = base + ((*in32 >> 16) & 31); - *(out + 17) = base + ((*in32 >> 21) & 31); - *(out + 18) = base + ((*in32 >> 26) & 31); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (5 - 4); - *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 4) & 31); - *(out + 21) = base + ((*in32 >> 9) & 31); - *(out + 22) = base + ((*in32 >> 14) & 31); - *(out + 23) = base + ((*in32 >> 19) & 31); - *(out + 24) = base + ((*in32 >> 24) & 31); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (5 - 2); - *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 2) & 31); - *(out + 27) = base + ((*in32 >> 7) & 31); - *(out + 28) = base + ((*in32 >> 12) & 31); - *(out + 29) = base + ((*in32 >> 17) & 31); - *(out + 30) = base + ((*in32 >> 22) & 31); - *(out + 31) = base + ((*in32 >> 27) & 31); - /* remaining: 0 bits */ - return 20; -} - -static uint32_t -pack6_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 6; - tmp |= (*(in + 2) - base) << 12; - tmp |= (*(in + 
3) - base) << 18; - tmp |= (*(in + 4) - base) << 24; - tmp |= (*(in + 5) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 5) - base) >> (6 - 4); - tmp |= (*(in + 6) - base) << 4; - tmp |= (*(in + 7) - base) << 10; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 22; - tmp |= (*(in + 10) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 10) - base) >> (6 - 2); - tmp |= (*(in + 11) - base) << 2; - tmp |= (*(in + 12) - base) << 8; - tmp |= (*(in + 13) - base) << 14; - tmp |= (*(in + 14) - base) << 20; - tmp |= (*(in + 15) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 6; - tmp |= (*(in + 18) - base) << 12; - tmp |= (*(in + 19) - base) << 18; - tmp |= (*(in + 20) - base) << 24; - tmp |= (*(in + 21) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 21) - base) >> (6 - 4); - tmp |= (*(in + 22) - base) << 4; - tmp |= (*(in + 23) - base) << 10; - tmp |= (*(in + 24) - base) << 16; - tmp |= (*(in + 25) - base) << 22; - tmp |= (*(in + 26) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 26) - base) >> (6 - 2); - tmp |= (*(in + 27) - base) << 2; - tmp |= (*(in + 28) - base) << 8; - tmp |= (*(in + 29) - base) << 14; - tmp |= (*(in + 30) - base) << 20; - tmp |= (*(in + 31) - base) << 26; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 24) */ - memcpy(out, &tmp, length); - return 24; -} - -static uint32_t -unpack6_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 63); - *(out + 1) = base + ((*in32 >> 6) & 63); 
- *(out + 2) = base + ((*in32 >> 12) & 63); - *(out + 3) = base + ((*in32 >> 18) & 63); - *(out + 4) = base + ((*in32 >> 24) & 63); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (6 - 4); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 63); - *(out + 7) = base + ((*in32 >> 10) & 63); - *(out + 8) = base + ((*in32 >> 16) & 63); - *(out + 9) = base + ((*in32 >> 22) & 63); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (6 - 2); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 2) & 63); - *(out + 12) = base + ((*in32 >> 8) & 63); - *(out + 13) = base + ((*in32 >> 14) & 63); - *(out + 14) = base + ((*in32 >> 20) & 63); - *(out + 15) = base + ((*in32 >> 26) & 63); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 16) = base + ((*in32 >> 0) & 63); - *(out + 17) = base + ((*in32 >> 6) & 63); - *(out + 18) = base + ((*in32 >> 12) & 63); - *(out + 19) = base + ((*in32 >> 18) & 63); - *(out + 20) = base + ((*in32 >> 24) & 63); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (6 - 4); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 4) & 63); - *(out + 23) = base + ((*in32 >> 10) & 63); - *(out + 24) = base + ((*in32 >> 16) & 63); - *(out + 25) = base + ((*in32 >> 22) & 63); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 2)) << (6 - 2); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 2) & 63); - *(out + 28) = base + ((*in32 >> 8) & 63); - *(out + 29) = base + ((*in32 >> 14) & 63); - *(out + 30) = base + ((*in32 >> 20) & 63); - *(out + 31) = base + ((*in32 >> 26) & 63); - /* remaining: 0 bits */ - return 24; -} - -static uint32_t -pack7_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 7; - tmp |= (*(in + 2) - 
base) << 14; - tmp |= (*(in + 3) - base) << 21; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) >> (7 - 3); - tmp |= (*(in + 5) - base) << 3; - tmp |= (*(in + 6) - base) << 10; - tmp |= (*(in + 7) - base) << 17; - tmp |= (*(in + 8) - base) << 24; - tmp |= (*(in + 9) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 9) - base) >> (7 - 6); - tmp |= (*(in + 10) - base) << 6; - tmp |= (*(in + 11) - base) << 13; - tmp |= (*(in + 12) - base) << 20; - tmp |= (*(in + 13) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 13) - base) >> (7 - 2); - tmp |= (*(in + 14) - base) << 2; - tmp |= (*(in + 15) - base) << 9; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 23; - tmp |= (*(in + 18) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 18) - base) >> (7 - 5); - tmp |= (*(in + 19) - base) << 5; - tmp |= (*(in + 20) - base) << 12; - tmp |= (*(in + 21) - base) << 19; - tmp |= (*(in + 22) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 22) - base) >> (7 - 1); - tmp |= (*(in + 23) - base) << 1; - tmp |= (*(in + 24) - base) << 8; - tmp |= (*(in + 25) - base) << 15; - tmp |= (*(in + 26) - base) << 22; - tmp |= (*(in + 27) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 27) - base) >> (7 - 4); - tmp |= (*(in + 28) - base) << 4; - tmp |= (*(in + 29) - base) << 11; - tmp |= (*(in + 30) - base) << 18; - tmp |= (*(in + 31) - base) << 25; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 28) */ - memcpy(out, &tmp, length); - return 28; -} - -static uint32_t 
-unpack7_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 127); - *(out + 1) = base + ((*in32 >> 7) & 127); - *(out + 2) = base + ((*in32 >> 14) & 127); - *(out + 3) = base + ((*in32 >> 21) & 127); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 3)) << (7 - 3); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 3) & 127); - *(out + 6) = base + ((*in32 >> 10) & 127); - *(out + 7) = base + ((*in32 >> 17) & 127); - *(out + 8) = base + ((*in32 >> 24) & 127); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (7 - 6); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 127); - *(out + 11) = base + ((*in32 >> 13) & 127); - *(out + 12) = base + ((*in32 >> 20) & 127); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 2)) << (7 - 2); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 127); - *(out + 15) = base + ((*in32 >> 9) & 127); - *(out + 16) = base + ((*in32 >> 16) & 127); - *(out + 17) = base + ((*in32 >> 23) & 127); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 5)) << (7 - 5); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 5) & 127); - *(out + 20) = base + ((*in32 >> 12) & 127); - *(out + 21) = base + ((*in32 >> 19) & 127); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 1)) << (7 - 1); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 1) & 127); - *(out + 24) = base + ((*in32 >> 8) & 127); - *(out + 25) = base + ((*in32 >> 15) & 127); - *(out + 26) = base + ((*in32 >> 22) & 127); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 4)) << (7 - 4); - *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 4) & 
127); - *(out + 29) = base + ((*in32 >> 11) & 127); - *(out + 30) = base + ((*in32 >> 18) & 127); - *(out + 31) = base + ((*in32 >> 25) & 127); - /* remaining: 0 bits */ - return 28; -} - -static uint32_t -pack8_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 8; - tmp |= (*(in + 2) - base) << 16; - tmp |= (*(in + 3) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 8; - tmp |= (*(in + 6) - base) << 16; - tmp |= (*(in + 7) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 8; - tmp |= (*(in + 10) - base) << 16; - tmp |= (*(in + 11) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 12) - base) << 0; - tmp |= (*(in + 13) - base) << 8; - tmp |= (*(in + 14) - base) << 16; - tmp |= (*(in + 15) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 8; - tmp |= (*(in + 18) - base) << 16; - tmp |= (*(in + 19) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 20) - base) << 0; - tmp |= (*(in + 21) - base) << 8; - tmp |= (*(in + 22) - base) << 16; - tmp |= (*(in + 23) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 24) - base) << 0; - tmp |= (*(in + 25) - base) << 8; - tmp |= (*(in + 26) - base) << 16; - tmp |= (*(in + 27) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 28) - base) << 0; - tmp |= (*(in + 29) - base) << 8; - tmp |= (*(in + 30) - base) 
<< 16; - tmp |= (*(in + 31) - base) << 24; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 32) */ - memcpy(out, &tmp, length); - return 32; -} - -static uint32_t -unpack8_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 255); - *(out + 1) = base + ((*in32 >> 8) & 255); - *(out + 2) = base + ((*in32 >> 16) & 255); - *(out + 3) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 4) = base + ((*in32 >> 0) & 255); - *(out + 5) = base + ((*in32 >> 8) & 255); - *(out + 6) = base + ((*in32 >> 16) & 255); - *(out + 7) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 12) */ - *(out + 8) = base + ((*in32 >> 0) & 255); - *(out + 9) = base + ((*in32 >> 8) & 255); - *(out + 10) = base + ((*in32 >> 16) & 255); - *(out + 11) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 12) = base + ((*in32 >> 0) & 255); - *(out + 13) = base + ((*in32 >> 8) & 255); - *(out + 14) = base + ((*in32 >> 16) & 255); - *(out + 15) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 20) */ - *(out + 16) = base + ((*in32 >> 0) & 255); - *(out + 17) = base + ((*in32 >> 8) & 255); - *(out + 18) = base + ((*in32 >> 16) & 255); - *(out + 19) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 24) */ - *(out + 20) = base + ((*in32 >> 0) & 255); - *(out + 21) = base + ((*in32 >> 8) & 255); - *(out + 22) = base + ((*in32 >> 16) & 255); - *(out + 23) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 28) */ - *(out + 24) = base + ((*in32 >> 0) & 255); - *(out + 25) = base + ((*in32 >> 8) & 255); - *(out + 26) = base + ((*in32 >> 16) & 255); - *(out + 27) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 32) */ - *(out + 28) = base + ((*in32 >> 0) & 255); - *(out + 29) = base + 
((*in32 >> 8) & 255); - *(out + 30) = base + ((*in32 >> 16) & 255); - *(out + 31) = base + ((*in32 >> 24) & 255); - /* remaining: 0 bits */ - return 32; -} - -static uint32_t -pack9_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 9; - tmp |= (*(in + 2) - base) << 18; - tmp |= (*(in + 3) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (9 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 13; - tmp |= (*(in + 6) - base) << 22; - tmp |= (*(in + 7) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 7) - base) >> (9 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 17; - tmp |= (*(in + 10) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 10) - base) >> (9 - 3); - tmp |= (*(in + 11) - base) << 3; - tmp |= (*(in + 12) - base) << 12; - tmp |= (*(in + 13) - base) << 21; - tmp |= (*(in + 14) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 14) - base) >> (9 - 7); - tmp |= (*(in + 15) - base) << 7; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 17) - base) >> (9 - 2); - tmp |= (*(in + 18) - base) << 2; - tmp |= (*(in + 19) - base) << 11; - tmp |= (*(in + 20) - base) << 20; - tmp |= (*(in + 21) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 21) - base) >> (9 - 6); - tmp |= (*(in + 22) - base) << 6; - tmp |= (*(in + 23) - base) << 15; - tmp |= (*(in + 24) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 28) */ - tmp = (*(in + 24) - base) >> (9 - 1); - tmp |= (*(in + 25) - base) << 1; - tmp |= (*(in + 26) - base) << 10; - tmp |= (*(in + 27) - base) << 19; - tmp |= (*(in + 28) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 28) - base) >> (9 - 5); - tmp |= (*(in + 29) - base) << 5; - tmp |= (*(in + 30) - base) << 14; - tmp |= (*(in + 31) - base) << 23; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 36) */ - memcpy(out, &tmp, length); - return 36; -} - -static uint32_t -unpack9_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 511); - *(out + 1) = base + ((*in32 >> 9) & 511); - *(out + 2) = base + ((*in32 >> 18) & 511); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (9 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 511); - *(out + 5) = base + ((*in32 >> 13) & 511); - *(out + 6) = base + ((*in32 >> 22) & 511); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (9 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 511); - *(out + 9) = base + ((*in32 >> 17) & 511); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 3)) << (9 - 3); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 3) & 511); - *(out + 12) = base + ((*in32 >> 12) & 511); - *(out + 13) = base + ((*in32 >> 21) & 511); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 7)) << (9 - 7); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 7) & 511); - *(out + 16) = base + ((*in32 >> 16) & 511); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 2)) << (9 - 2); - *(out + 17) = base 
+ tmp; - *(out + 18) = base + ((*in32 >> 2) & 511); - *(out + 19) = base + ((*in32 >> 11) & 511); - *(out + 20) = base + ((*in32 >> 20) & 511); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 6)) << (9 - 6); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 6) & 511); - *(out + 23) = base + ((*in32 >> 15) & 511); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 1)) << (9 - 1); - *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 1) & 511); - *(out + 26) = base + ((*in32 >> 10) & 511); - *(out + 27) = base + ((*in32 >> 19) & 511); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 5)) << (9 - 5); - *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 5) & 511); - *(out + 30) = base + ((*in32 >> 14) & 511); - *(out + 31) = base + ((*in32 >> 23) & 511); - /* remaining: 0 bits */ - return 36; -} - -static uint32_t -pack10_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 10; - tmp |= (*(in + 2) - base) << 20; - tmp |= (*(in + 3) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (10 - 8); - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 18; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 6) - base) >> (10 - 6); - tmp |= (*(in + 7) - base) << 6; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 9) - base) >> (10 - 4); - tmp |= (*(in + 10) - base) << 4; - tmp |= (*(in + 11) - base) << 14; - tmp |= (*(in + 12) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 16) */ - tmp = (*(in + 12) - base) >> (10 - 2); - tmp |= (*(in + 13) - base) << 2; - tmp |= (*(in + 14) - base) << 12; - tmp |= (*(in + 15) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 10; - tmp |= (*(in + 18) - base) << 20; - tmp |= (*(in + 19) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 19) - base) >> (10 - 8); - tmp |= (*(in + 20) - base) << 8; - tmp |= (*(in + 21) - base) << 18; - tmp |= (*(in + 22) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 22) - base) >> (10 - 6); - tmp |= (*(in + 23) - base) << 6; - tmp |= (*(in + 24) - base) << 16; - tmp |= (*(in + 25) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 25) - base) >> (10 - 4); - tmp |= (*(in + 26) - base) << 4; - tmp |= (*(in + 27) - base) << 14; - tmp |= (*(in + 28) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 28) - base) >> (10 - 2); - tmp |= (*(in + 29) - base) << 2; - tmp |= (*(in + 30) - base) << 12; - tmp |= (*(in + 31) - base) << 22; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 40) */ - memcpy(out, &tmp, length); - return 40; -} - -static uint32_t -unpack10_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1023); - *(out + 1) = base + ((*in32 >> 10) & 1023); - *(out + 2) = base + ((*in32 >> 20) & 1023); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 8)) << (10 - 8); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 1023); - *(out + 5) = base + ((*in32 >> 
18) & 1023); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (10 - 6); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 6) & 1023); - *(out + 8) = base + ((*in32 >> 16) & 1023); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (10 - 4); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 1023); - *(out + 11) = base + ((*in32 >> 14) & 1023); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (10 - 2); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 2) & 1023); - *(out + 14) = base + ((*in32 >> 12) & 1023); - *(out + 15) = base + ((*in32 >> 22) & 1023); - in32++; - /* consumed: 4 bytes (total: 24) */ - *(out + 16) = base + ((*in32 >> 0) & 1023); - *(out + 17) = base + ((*in32 >> 10) & 1023); - *(out + 18) = base + ((*in32 >> 20) & 1023); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 8)) << (10 - 8); - *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 8) & 1023); - *(out + 21) = base + ((*in32 >> 18) & 1023); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 6)) << (10 - 6); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 6) & 1023); - *(out + 24) = base + ((*in32 >> 16) & 1023); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 4)) << (10 - 4); - *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 4) & 1023); - *(out + 27) = base + ((*in32 >> 14) & 1023); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 2)) << (10 - 2); - *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 2) & 1023); - *(out + 30) = base + ((*in32 >> 12) & 1023); - *(out + 31) = base + ((*in32 >> 22) & 1023); - /* remaining: 0 bits */ - return 40; -} - -static uint32_t 
-pack11_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 11; - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (11 - 1); - tmp |= (*(in + 3) - base) << 1; - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (11 - 2); - tmp |= (*(in + 6) - base) << 2; - tmp |= (*(in + 7) - base) << 13; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) >> (11 - 3); - tmp |= (*(in + 9) - base) << 3; - tmp |= (*(in + 10) - base) << 14; - tmp |= (*(in + 11) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 11) - base) >> (11 - 4); - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in + 13) - base) << 15; - tmp |= (*(in + 14) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 14) - base) >> (11 - 5); - tmp |= (*(in + 15) - base) << 5; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 17) - base) >> (11 - 6); - tmp |= (*(in + 18) - base) << 6; - tmp |= (*(in + 19) - base) << 17; - tmp |= (*(in + 20) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 20) - base) >> (11 - 7); - tmp |= (*(in + 21) - base) << 7; - tmp |= (*(in + 22) - base) << 18; - tmp |= (*(in + 23) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 23) - base) >> (11 - 8); - tmp |= (*(in + 
24) - base) << 8; - tmp |= (*(in + 25) - base) << 19; - tmp |= (*(in + 26) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 26) - base) >> (11 - 9); - tmp |= (*(in + 27) - base) << 9; - tmp |= (*(in + 28) - base) << 20; - tmp |= (*(in + 29) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 29) - base) >> (11 - 10); - tmp |= (*(in + 30) - base) << 10; - tmp |= (*(in + 31) - base) << 21; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 44) */ - memcpy(out, &tmp, length); - return 44; -} - -static uint32_t -unpack11_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2047); - *(out + 1) = base + ((*in32 >> 11) & 2047); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 1)) << (11 - 1); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 1) & 2047); - *(out + 4) = base + ((*in32 >> 12) & 2047); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (11 - 2); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 2047); - *(out + 7) = base + ((*in32 >> 13) & 2047); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 3)) << (11 - 3); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 3) & 2047); - *(out + 10) = base + ((*in32 >> 14) & 2047); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (11 - 4); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 2047); - *(out + 13) = base + ((*in32 >> 15) & 2047); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 5)) << (11 - 5); - *(out + 14) = base + tmp; - *(out + 15) = 
base + ((*in32 >> 5) & 2047); - *(out + 16) = base + ((*in32 >> 16) & 2047); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 6)) << (11 - 6); - *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 6) & 2047); - *(out + 19) = base + ((*in32 >> 17) & 2047); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 7)) << (11 - 7); - *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 7) & 2047); - *(out + 22) = base + ((*in32 >> 18) & 2047); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 8)) << (11 - 8); - *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 2047); - *(out + 25) = base + ((*in32 >> 19) & 2047); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 9)) << (11 - 9); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 9) & 2047); - *(out + 28) = base + ((*in32 >> 20) & 2047); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 10)) << (11 - 10); - *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 10) & 2047); - *(out + 31) = base + ((*in32 >> 21) & 2047); - /* remaining: 0 bits */ - return 44; -} - -static uint32_t -pack12_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 12; - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (12 - 4); - tmp |= (*(in + 3) - base) << 4; - tmp |= (*(in + 4) - base) << 16; - tmp |= (*(in + 5) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (12 - 8); - tmp |= (*(in + 6) - base) << 8; - tmp |= (*(in + 7) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 12; - tmp |= (*(in + 10) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 10) - base) >> (12 - 4); - tmp |= (*(in + 11) - base) << 4; - tmp |= (*(in + 12) - base) << 16; - tmp |= (*(in + 13) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 13) - base) >> (12 - 8); - tmp |= (*(in + 14) - base) << 8; - tmp |= (*(in + 15) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 12; - tmp |= (*(in + 18) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 18) - base) >> (12 - 4); - tmp |= (*(in + 19) - base) << 4; - tmp |= (*(in + 20) - base) << 16; - tmp |= (*(in + 21) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 21) - base) >> (12 - 8); - tmp |= (*(in + 22) - base) << 8; - tmp |= (*(in + 23) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 24) - base) << 0; - tmp |= (*(in + 25) - base) << 12; - tmp |= (*(in + 26) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 26) - base) >> (12 - 4); - tmp |= (*(in + 27) - base) << 4; - tmp |= (*(in + 28) - base) << 16; - tmp |= (*(in + 29) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 29) - base) >> (12 - 8); - tmp |= (*(in + 30) - base) << 8; - tmp |= (*(in + 31) - base) << 20; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 48) */ - memcpy(out, &tmp, length); - return 48; 
-} - -static uint32_t -unpack12_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4095); - *(out + 1) = base + ((*in32 >> 12) & 4095); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (12 - 4); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 4) & 4095); - *(out + 4) = base + ((*in32 >> 16) & 4095); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (12 - 8); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 8) & 4095); - *(out + 7) = base + ((*in32 >> 20) & 4095); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 8) = base + ((*in32 >> 0) & 4095); - *(out + 9) = base + ((*in32 >> 12) & 4095); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (12 - 4); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 4) & 4095); - *(out + 12) = base + ((*in32 >> 16) & 4095); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (12 - 8); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 8) & 4095); - *(out + 15) = base + ((*in32 >> 20) & 4095); - in32++; - /* consumed: 4 bytes (total: 28) */ - *(out + 16) = base + ((*in32 >> 0) & 4095); - *(out + 17) = base + ((*in32 >> 12) & 4095); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 4)) << (12 - 4); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 4) & 4095); - *(out + 20) = base + ((*in32 >> 16) & 4095); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 8)) << (12 - 8); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 8) & 4095); - *(out + 23) = base + ((*in32 >> 20) & 4095); - in32++; - /* consumed: 4 bytes (total: 40) */ - *(out + 24) = base + ((*in32 >> 
0) & 4095); - *(out + 25) = base + ((*in32 >> 12) & 4095); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 4)) << (12 - 4); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 4) & 4095); - *(out + 28) = base + ((*in32 >> 16) & 4095); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 8)) << (12 - 8); - *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 8) & 4095); - *(out + 31) = base + ((*in32 >> 20) & 4095); - /* remaining: 0 bits */ - return 48; -} - -static uint32_t -pack13_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 13; - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (13 - 7); - tmp |= (*(in + 3) - base) << 7; - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (13 - 1); - tmp |= (*(in + 5) - base) << 1; - tmp |= (*(in + 6) - base) << 14; - tmp |= (*(in + 7) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 7) - base) >> (13 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (13 - 2); - tmp |= (*(in + 10) - base) << 2; - tmp |= (*(in + 11) - base) << 15; - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 12) - base) >> (13 - 9); - tmp |= (*(in + 13) - base) << 9; - tmp |= (*(in + 14) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 14) - base) >> (13 - 3); - tmp |= (*(in + 
15) - base) << 3; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 17) - base) >> (13 - 10); - tmp |= (*(in + 18) - base) << 10; - tmp |= (*(in + 19) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 19) - base) >> (13 - 4); - tmp |= (*(in + 20) - base) << 4; - tmp |= (*(in + 21) - base) << 17; - tmp |= (*(in + 22) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 22) - base) >> (13 - 11); - tmp |= (*(in + 23) - base) << 11; - tmp |= (*(in + 24) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 24) - base) >> (13 - 5); - tmp |= (*(in + 25) - base) << 5; - tmp |= (*(in + 26) - base) << 18; - tmp |= (*(in + 27) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 27) - base) >> (13 - 12); - tmp |= (*(in + 28) - base) << 12; - tmp |= (*(in + 29) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 29) - base) >> (13 - 6); - tmp |= (*(in + 30) - base) << 6; - tmp |= (*(in + 31) - base) << 19; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 52) */ - memcpy(out, &tmp, length); - return 52; -} - -static uint32_t -unpack13_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8191); - *(out + 1) = base + ((*in32 >> 13) & 8191); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 7)) << (13 - 7); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 7) & 8191); - tmp = (*in32 >> 20); - in32++; - /* consumed: 
4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 1)) << (13 - 1); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 1) & 8191); - *(out + 6) = base + ((*in32 >> 14) & 8191); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 8)) << (13 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 8191); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (13 - 2); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 8191); - *(out + 11) = base + ((*in32 >> 15) & 8191); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 9)) << (13 - 9); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 9) & 8191); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 3)) << (13 - 3); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 3) & 8191); - *(out + 16) = base + ((*in32 >> 16) & 8191); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 10)) << (13 - 10); - *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 10) & 8191); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 4)) << (13 - 4); - *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 4) & 8191); - *(out + 21) = base + ((*in32 >> 17) & 8191); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 11)) << (13 - 11); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 11) & 8191); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 5)) << (13 - 5); - *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 5) & 8191); - *(out + 26) = base + ((*in32 >> 18) & 8191); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 12)) << (13 - 12); - *(out + 27) 
= base + tmp; - *(out + 28) = base + ((*in32 >> 12) & 8191); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 6)) << (13 - 6); - *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 6) & 8191); - *(out + 31) = base + ((*in32 >> 19) & 8191); - /* remaining: 0 bits */ - return 52; -} - -static uint32_t -pack14_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 14; - tmp |= (*(in + 2) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (14 - 10); - tmp |= (*(in + 3) - base) << 10; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (14 - 6); - tmp |= (*(in + 5) - base) << 6; - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (14 - 2); - tmp |= (*(in + 7) - base) << 2; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (14 - 12); - tmp |= (*(in + 10) - base) << 12; - tmp |= (*(in + 11) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 11) - base) >> (14 - 8); - tmp |= (*(in + 12) - base) << 8; - tmp |= (*(in + 13) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 13) - base) >> (14 - 4); - tmp |= (*(in + 14) - base) << 4; - tmp |= (*(in + 15) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 14; - tmp |= (*(in + 18) - base) << 28; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 18) - base) >> (14 - 10); - tmp |= (*(in + 19) - base) << 10; - tmp |= (*(in + 20) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 20) - base) >> (14 - 6); - tmp |= (*(in + 21) - base) << 6; - tmp |= (*(in + 22) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 22) - base) >> (14 - 2); - tmp |= (*(in + 23) - base) << 2; - tmp |= (*(in + 24) - base) << 16; - tmp |= (*(in + 25) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 25) - base) >> (14 - 12); - tmp |= (*(in + 26) - base) << 12; - tmp |= (*(in + 27) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 27) - base) >> (14 - 8); - tmp |= (*(in + 28) - base) << 8; - tmp |= (*(in + 29) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 29) - base) >> (14 - 4); - tmp |= (*(in + 30) - base) << 4; - tmp |= (*(in + 31) - base) << 18; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 56) */ - memcpy(out, &tmp, length); - return 56; -} - -static uint32_t -unpack14_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16383); - *(out + 1) = base + ((*in32 >> 14) & 16383); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 10)) << (14 - 10); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 10) & 16383); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (14 - 6); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 6) & 
16383); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 2)) << (14 - 2); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 2) & 16383); - *(out + 8) = base + ((*in32 >> 16) & 16383); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (14 - 12); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 12) & 16383); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (14 - 8); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 16383); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 4)) << (14 - 4); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 4) & 16383); - *(out + 15) = base + ((*in32 >> 18) & 16383); - in32++; - /* consumed: 4 bytes (total: 32) */ - *(out + 16) = base + ((*in32 >> 0) & 16383); - *(out + 17) = base + ((*in32 >> 14) & 16383); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 10)) << (14 - 10); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 10) & 16383); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 6)) << (14 - 6); - *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 6) & 16383); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 2)) << (14 - 2); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 2) & 16383); - *(out + 24) = base + ((*in32 >> 16) & 16383); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 12)) << (14 - 12); - *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 12) & 16383); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 8)) << (14 - 8); - *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 8) & 16383); - tmp = 
(*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (14 - 4); - *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 4) & 16383); - *(out + 31) = base + ((*in32 >> 18) & 16383); - /* remaining: 0 bits */ - return 56; -} - -static uint32_t -pack15_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 15; - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (15 - 13); - tmp |= (*(in + 3) - base) << 13; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (15 - 11); - tmp |= (*(in + 5) - base) << 11; - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (15 - 9); - tmp |= (*(in + 7) - base) << 9; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) >> (15 - 7); - tmp |= (*(in + 9) - base) << 7; - tmp |= (*(in + 10) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) >> (15 - 5); - tmp |= (*(in + 11) - base) << 5; - tmp |= (*(in + 12) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) >> (15 - 3); - tmp |= (*(in + 13) - base) << 3; - tmp |= (*(in + 14) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) >> (15 - 1); - tmp |= (*(in + 15) - base) << 1; - tmp |= (*(in + 16) - base) << 16; - tmp |= (*(in + 17) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 
bytes (total: 32) */ - tmp = (*(in + 17) - base) >> (15 - 14); - tmp |= (*(in + 18) - base) << 14; - tmp |= (*(in + 19) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 19) - base) >> (15 - 12); - tmp |= (*(in + 20) - base) << 12; - tmp |= (*(in + 21) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 21) - base) >> (15 - 10); - tmp |= (*(in + 22) - base) << 10; - tmp |= (*(in + 23) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 23) - base) >> (15 - 8); - tmp |= (*(in + 24) - base) << 8; - tmp |= (*(in + 25) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 25) - base) >> (15 - 6); - tmp |= (*(in + 26) - base) << 6; - tmp |= (*(in + 27) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 27) - base) >> (15 - 4); - tmp |= (*(in + 28) - base) << 4; - tmp |= (*(in + 29) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 29) - base) >> (15 - 2); - tmp |= (*(in + 30) - base) << 2; - tmp |= (*(in + 31) - base) << 17; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 60) */ - memcpy(out, &tmp, length); - return 60; -} - -static uint32_t -unpack15_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 32767); - *(out + 1) = base + ((*in32 >> 15) & 32767); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 13)) << (15 - 13); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 13) & 32767); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ 
- tmp |= (*in32 % (1U << 11)) << (15 - 11); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 11) & 32767); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 9)) << (15 - 9); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 9) & 32767); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 7)) << (15 - 7); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 7) & 32767); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 5)) << (15 - 5); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 5) & 32767); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 3)) << (15 - 3); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 3) & 32767); - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 1)) << (15 - 1); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 1) & 32767); - *(out + 16) = base + ((*in32 >> 16) & 32767); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 14)) << (15 - 14); - *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 14) & 32767); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 12)) << (15 - 12); - *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 12) & 32767); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 10)) << (15 - 10); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 10) & 32767); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 8)) << (15 - 8); - *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 32767); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 6)) << (15 - 6); - *(out + 25) = base + 
tmp; - *(out + 26) = base + ((*in32 >> 6) & 32767); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (15 - 4); - *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 4) & 32767); - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 2)) << (15 - 2); - *(out + 29) = base + tmp; - *(out + 30) = base + ((*in32 >> 2) & 32767); - *(out + 31) = base + ((*in32 >> 17) & 32767); - /* remaining: 0 bits */ - return 60; -} - -static uint32_t -pack16_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) << 0; - tmp |= (*(in + 3) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) << 0; - tmp |= (*(in + 7) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) << 0; - tmp |= (*(in + 11) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) << 0; - tmp |= (*(in + 13) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) << 0; - tmp |= (*(in + 15) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 16; - *(uint32_t *)out = tmp; - 
out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 18) - base) << 0; - tmp |= (*(in + 19) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 20) - base) << 0; - tmp |= (*(in + 21) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 22) - base) << 0; - tmp |= (*(in + 23) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 24) - base) << 0; - tmp |= (*(in + 25) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 26) - base) << 0; - tmp |= (*(in + 27) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 28) - base) << 0; - tmp |= (*(in + 29) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 30) - base) << 0; - tmp |= (*(in + 31) - base) << 16; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 64) */ - memcpy(out, &tmp, length); - return 64; -} - -static uint32_t -unpack16_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 65535); - *(out + 1) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 2) = base + ((*in32 >> 0) & 65535); - *(out + 3) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 12) */ - *(out + 4) = base + ((*in32 >> 0) & 65535); - *(out + 5) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 6) = base + ((*in32 >> 0) & 65535); - *(out + 7) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 20) */ - *(out + 8) = base + ((*in32 >> 0) & 65535); 
- *(out + 9) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 24) */ - *(out + 10) = base + ((*in32 >> 0) & 65535); - *(out + 11) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 28) */ - *(out + 12) = base + ((*in32 >> 0) & 65535); - *(out + 13) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 32) */ - *(out + 14) = base + ((*in32 >> 0) & 65535); - *(out + 15) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 36) */ - *(out + 16) = base + ((*in32 >> 0) & 65535); - *(out + 17) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 40) */ - *(out + 18) = base + ((*in32 >> 0) & 65535); - *(out + 19) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 44) */ - *(out + 20) = base + ((*in32 >> 0) & 65535); - *(out + 21) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 48) */ - *(out + 22) = base + ((*in32 >> 0) & 65535); - *(out + 23) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 52) */ - *(out + 24) = base + ((*in32 >> 0) & 65535); - *(out + 25) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 56) */ - *(out + 26) = base + ((*in32 >> 0) & 65535); - *(out + 27) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 60) */ - *(out + 28) = base + ((*in32 >> 0) & 65535); - *(out + 29) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 64) */ - *(out + 30) = base + ((*in32 >> 0) & 65535); - *(out + 31) = base + ((*in32 >> 16) & 65535); - /* remaining: 0 bits */ - return 64; -} - -static uint32_t -pack17_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (17 - 2); - tmp |= (*(in + 2) - base) << 2; 
- tmp |= (*(in + 3) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (17 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (17 - 6); - tmp |= (*(in + 6) - base) << 6; - tmp |= (*(in + 7) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (17 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 9) - base) >> (17 - 10); - tmp |= (*(in + 10) - base) << 10; - tmp |= (*(in + 11) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 11) - base) >> (17 - 12); - tmp |= (*(in + 12) - base) << 12; - tmp |= (*(in + 13) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 13) - base) >> (17 - 14); - tmp |= (*(in + 14) - base) << 14; - tmp |= (*(in + 15) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 15) - base) >> (17 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 16) - base) >> (17 - 1); - tmp |= (*(in + 17) - base) << 1; - tmp |= (*(in + 18) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 18) - base) >> (17 - 3); - tmp |= (*(in + 19) - base) << 3; - tmp |= (*(in + 20) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 20) - base) >> (17 - 5); - tmp |= (*(in + 21) - base) << 5; - tmp |= 
(*(in + 22) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 22) - base) >> (17 - 7); - tmp |= (*(in + 23) - base) << 7; - tmp |= (*(in + 24) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 24) - base) >> (17 - 9); - tmp |= (*(in + 25) - base) << 9; - tmp |= (*(in + 26) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 26) - base) >> (17 - 11); - tmp |= (*(in + 27) - base) << 11; - tmp |= (*(in + 28) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 28) - base) >> (17 - 13); - tmp |= (*(in + 29) - base) << 13; - tmp |= (*(in + 30) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 30) - base) >> (17 - 15); - tmp |= (*(in + 31) - base) << 15; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 68) */ - memcpy(out, &tmp, length); - return 68; -} - -static uint32_t -unpack17_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 131071); - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 2)) << (17 - 2); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 2) & 131071); - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 4)) << (17 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 131071); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 6)) << (17 - 6); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 6) & 131071); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % 
(1U << 8)) << (17 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 131071); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 10)) << (17 - 10); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 10) & 131071); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 12)) << (17 - 12); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 12) & 131071); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 14)) << (17 - 14); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 14) & 131071); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 16)) << (17 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 1)) << (17 - 1); - *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 1) & 131071); - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 3)) << (17 - 3); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 3) & 131071); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 5)) << (17 - 5); - *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 5) & 131071); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 7)) << (17 - 7); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 7) & 131071); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 9)) << (17 - 9); - *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 9) & 131071); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 11)) << (17 - 11); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 11) & 131071); - tmp = (*in32 >> 28); - in32++; - /* 
consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 13)) << (17 - 13); - *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 13) & 131071); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 15)) << (17 - 15); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 15) & 131071); - /* remaining: 0 bits */ - return 68; -} - -static uint32_t -pack18_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (18 - 4); - tmp |= (*(in + 2) - base) << 4; - tmp |= (*(in + 3) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (18 - 8); - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (18 - 12); - tmp |= (*(in + 6) - base) << 12; - tmp |= (*(in + 7) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (18 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (18 - 2); - tmp |= (*(in + 9) - base) << 2; - tmp |= (*(in + 10) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (18 - 6); - tmp |= (*(in + 11) - base) << 6; - tmp |= (*(in + 12) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 12) - base) >> (18 - 10); - tmp |= (*(in + 13) - base) << 10; - tmp |= (*(in + 14) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 32) */ - tmp = (*(in + 14) - base) >> (18 - 14); - tmp |= (*(in + 15) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 17) - base) >> (18 - 4); - tmp |= (*(in + 18) - base) << 4; - tmp |= (*(in + 19) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 19) - base) >> (18 - 8); - tmp |= (*(in + 20) - base) << 8; - tmp |= (*(in + 21) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 21) - base) >> (18 - 12); - tmp |= (*(in + 22) - base) << 12; - tmp |= (*(in + 23) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 23) - base) >> (18 - 16); - tmp |= (*(in + 24) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 24) - base) >> (18 - 2); - tmp |= (*(in + 25) - base) << 2; - tmp |= (*(in + 26) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 26) - base) >> (18 - 6); - tmp |= (*(in + 27) - base) << 6; - tmp |= (*(in + 28) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 28) - base) >> (18 - 10); - tmp |= (*(in + 29) - base) << 10; - tmp |= (*(in + 30) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 30) - base) >> (18 - 14); - tmp |= (*(in + 31) - base) << 14; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 72) */ - memcpy(out, &tmp, length); - return 72; -} - -static uint32_t 
-unpack18_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 262143); - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (18 - 4); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 4) & 262143); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (18 - 8); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 262143); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 12)) << (18 - 12); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 12) & 262143); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 16)) << (18 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 2)) << (18 - 2); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 2) & 262143); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 6)) << (18 - 6); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 6) & 262143); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 10)) << (18 - 10); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 10) & 262143); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 14)) << (18 - 14); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 14) & 262143); - in32++; - /* consumed: 4 bytes (total: 40) */ - *(out + 16) = base + ((*in32 >> 0) & 262143); - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 4)) << (18 - 4); - *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 4) & 262143); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp 
|= (*in32 % (1U << 8)) << (18 - 8); - *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 8) & 262143); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 12)) << (18 - 12); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 12) & 262143); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 16)) << (18 - 16); - *(out + 23) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 2)) << (18 - 2); - *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 2) & 262143); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 6)) << (18 - 6); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 6) & 262143); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 10)) << (18 - 10); - *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 10) & 262143); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 14)) << (18 - 14); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 14) & 262143); - /* remaining: 0 bits */ - return 72; -} - -static uint32_t -pack19_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (19 - 6); - tmp |= (*(in + 2) - base) << 6; - tmp |= (*(in + 3) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (19 - 12); - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (19 - 18); - tmp |= (*(in + 6) - base) << 18; - 
*(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (19 - 5); - tmp |= (*(in + 7) - base) << 5; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (19 - 11); - tmp |= (*(in + 9) - base) << 11; - tmp |= (*(in + 10) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (19 - 17); - tmp |= (*(in + 11) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (19 - 4); - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in + 13) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 13) - base) >> (19 - 10); - tmp |= (*(in + 14) - base) << 10; - tmp |= (*(in + 15) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 15) - base) >> (19 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 16) - base) >> (19 - 3); - tmp |= (*(in + 17) - base) << 3; - tmp |= (*(in + 18) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 18) - base) >> (19 - 9); - tmp |= (*(in + 19) - base) << 9; - tmp |= (*(in + 20) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 20) - base) >> (19 - 15); - tmp |= (*(in + 21) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 21) - base) >> (19 - 2); - tmp |= (*(in + 22) - base) << 2; - tmp |= (*(in + 23) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 
bytes (total: 56) */ - tmp = (*(in + 23) - base) >> (19 - 8); - tmp |= (*(in + 24) - base) << 8; - tmp |= (*(in + 25) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 25) - base) >> (19 - 14); - tmp |= (*(in + 26) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 26) - base) >> (19 - 1); - tmp |= (*(in + 27) - base) << 1; - tmp |= (*(in + 28) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 28) - base) >> (19 - 7); - tmp |= (*(in + 29) - base) << 7; - tmp |= (*(in + 30) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 30) - base) >> (19 - 13); - tmp |= (*(in + 31) - base) << 13; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 76) */ - memcpy(out, &tmp, length); - return 76; -} - -static uint32_t -unpack19_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 524287); - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 6)) << (19 - 6); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 6) & 524287); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 12)) << (19 - 12); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 12) & 524287); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 18)) << (19 - 18); - *(out + 5) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 5)) << (19 - 5); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 5) & 524287); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= 
(*in32 % (1U << 11)) << (19 - 11); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 11) & 524287); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 17)) << (19 - 17); - *(out + 10) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 4)) << (19 - 4); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 524287); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 10)) << (19 - 10); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 10) & 524287); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 16)) << (19 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 3)) << (19 - 3); - *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 3) & 524287); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 9)) << (19 - 9); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 9) & 524287); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 15)) << (19 - 15); - *(out + 20) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 2)) << (19 - 2); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 2) & 524287); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 8)) << (19 - 8); - *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 524287); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 14)) << (19 - 14); - *(out + 25) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 1)) << (19 - 1); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 1) & 
524287); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 7)) << (19 - 7); - *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 7) & 524287); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 13)) << (19 - 13); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 13) & 524287); - /* remaining: 0 bits */ - return 76; -} - -static uint32_t -pack20_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (20 - 8); - tmp |= (*(in + 2) - base) << 8; - tmp |= (*(in + 3) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (20 - 16); - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (20 - 4); - tmp |= (*(in + 5) - base) << 4; - tmp |= (*(in + 6) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (20 - 12); - tmp |= (*(in + 7) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (20 - 8); - tmp |= (*(in + 10) - base) << 8; - tmp |= (*(in + 11) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (20 - 16); - tmp |= (*(in + 12) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (20 - 4); 
- tmp |= (*(in + 13) - base) << 4; - tmp |= (*(in + 14) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 14) - base) >> (20 - 12); - tmp |= (*(in + 15) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 17) - base) >> (20 - 8); - tmp |= (*(in + 18) - base) << 8; - tmp |= (*(in + 19) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 19) - base) >> (20 - 16); - tmp |= (*(in + 20) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 20) - base) >> (20 - 4); - tmp |= (*(in + 21) - base) << 4; - tmp |= (*(in + 22) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 22) - base) >> (20 - 12); - tmp |= (*(in + 23) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 24) - base) << 0; - tmp |= (*(in + 25) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 25) - base) >> (20 - 8); - tmp |= (*(in + 26) - base) << 8; - tmp |= (*(in + 27) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 27) - base) >> (20 - 16); - tmp |= (*(in + 28) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 28) - base) >> (20 - 4); - tmp |= (*(in + 29) - base) << 4; - tmp |= (*(in + 30) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 30) - base) 
>> (20 - 12); - tmp |= (*(in + 31) - base) << 12; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 80) */ - memcpy(out, &tmp, length); - return 80; -} - -static uint32_t -unpack20_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1048575); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 8)) << (20 - 8); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 8) & 1048575); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 16)) << (20 - 16); - *(out + 3) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (20 - 4); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 4) & 1048575); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (20 - 12); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 12) & 1048575); - in32++; - /* consumed: 4 bytes (total: 24) */ - *(out + 8) = base + ((*in32 >> 0) & 1048575); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 8)) << (20 - 8); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 8) & 1048575); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 16)) << (20 - 16); - *(out + 11) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 4)) << (20 - 4); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 4) & 1048575); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 12)) << (20 - 12); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 12) & 1048575); - in32++; - /* consumed: 4 bytes (total: 44) */ - *(out + 16) = base + ((*in32 >> 0) & 
1048575); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 8)) << (20 - 8); - *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 8) & 1048575); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 16)) << (20 - 16); - *(out + 19) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (20 - 4); - *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 4) & 1048575); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 12)) << (20 - 12); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 12) & 1048575); - in32++; - /* consumed: 4 bytes (total: 64) */ - *(out + 24) = base + ((*in32 >> 0) & 1048575); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 8)) << (20 - 8); - *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 8) & 1048575); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 16)) << (20 - 16); - *(out + 27) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 4)) << (20 - 4); - *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 4) & 1048575); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 12)) << (20 - 12); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 12) & 1048575); - /* remaining: 0 bits */ - return 80; -} - -static uint32_t -pack21_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (21 - 10); - tmp |= (*(in + 2) - base) << 10; - tmp |= (*(in + 3) - base) << 31; - *(uint32_t *)out = tmp; - out += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (21 - 20); - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (21 - 9); - tmp |= (*(in + 5) - base) << 9; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (21 - 19); - tmp |= (*(in + 7) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (21 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (21 - 18); - tmp |= (*(in + 10) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (21 - 7); - tmp |= (*(in + 11) - base) << 7; - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (21 - 17); - tmp |= (*(in + 13) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 13) - base) >> (21 - 6); - tmp |= (*(in + 14) - base) << 6; - tmp |= (*(in + 15) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 15) - base) >> (21 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 16) - base) >> (21 - 5); - tmp |= (*(in + 17) - base) << 5; - tmp |= (*(in + 18) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 18) - base) >> (21 - 15); - tmp |= (*(in + 19) - base) << 15; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 19) - base) >> (21 - 4); - tmp |= (*(in + 20) - base) << 4; - tmp |= (*(in + 21) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 21) - base) >> (21 - 14); - tmp |= (*(in + 22) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 22) - base) >> (21 - 3); - tmp |= (*(in + 23) - base) << 3; - tmp |= (*(in + 24) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 24) - base) >> (21 - 13); - tmp |= (*(in + 25) - base) << 13; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 25) - base) >> (21 - 2); - tmp |= (*(in + 26) - base) << 2; - tmp |= (*(in + 27) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 27) - base) >> (21 - 12); - tmp |= (*(in + 28) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 28) - base) >> (21 - 1); - tmp |= (*(in + 29) - base) << 1; - tmp |= (*(in + 30) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 30) - base) >> (21 - 11); - tmp |= (*(in + 31) - base) << 11; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 84) */ - memcpy(out, &tmp, length); - return 84; -} - -static uint32_t -unpack21_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2097151); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 10)) << (21 - 10); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 10) & 
2097151); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 20)) << (21 - 20); - *(out + 3) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 9)) << (21 - 9); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 9) & 2097151); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 19)) << (21 - 19); - *(out + 6) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (21 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 2097151); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 18)) << (21 - 18); - *(out + 9) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 7)) << (21 - 7); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 7) & 2097151); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 17)) << (21 - 17); - *(out + 12) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 6)) << (21 - 6); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 6) & 2097151); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 16)) << (21 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 5)) << (21 - 5); - *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 5) & 2097151); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 15)) << (21 - 15); - *(out + 18) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (21 - 4); - *(out + 19) = base + tmp; - *(out + 20) = base + ((*in32 >> 4) & 
2097151); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 14)) << (21 - 14); - *(out + 21) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 3)) << (21 - 3); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 3) & 2097151); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 13)) << (21 - 13); - *(out + 24) = base + tmp; - tmp = (*in32 >> 13); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 2)) << (21 - 2); - *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 2) & 2097151); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 12)) << (21 - 12); - *(out + 27) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 1)) << (21 - 1); - *(out + 28) = base + tmp; - *(out + 29) = base + ((*in32 >> 1) & 2097151); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 11)) << (21 - 11); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 11) & 2097151); - /* remaining: 0 bits */ - return 84; -} - -static uint32_t -pack22_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (22 - 12); - tmp |= (*(in + 2) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (22 - 2); - tmp |= (*(in + 3) - base) << 2; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (22 - 14); - tmp |= (*(in + 5) - base) << 14; - *(uint32_t *)out = tmp; - out += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (22 - 4); - tmp |= (*(in + 6) - base) << 4; - tmp |= (*(in + 7) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (22 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (22 - 6); - tmp |= (*(in + 9) - base) << 6; - tmp |= (*(in + 10) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (22 - 18); - tmp |= (*(in + 11) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (22 - 8); - tmp |= (*(in + 12) - base) << 8; - tmp |= (*(in + 13) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 13) - base) >> (22 - 20); - tmp |= (*(in + 14) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 14) - base) >> (22 - 10); - tmp |= (*(in + 15) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 17) - base) >> (22 - 12); - tmp |= (*(in + 18) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 18) - base) >> (22 - 2); - tmp |= (*(in + 19) - base) << 2; - tmp |= (*(in + 20) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 20) - base) >> (22 - 14); - tmp |= (*(in + 21) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - 
/* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 21) - base) >> (22 - 4); - tmp |= (*(in + 22) - base) << 4; - tmp |= (*(in + 23) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 23) - base) >> (22 - 16); - tmp |= (*(in + 24) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 24) - base) >> (22 - 6); - tmp |= (*(in + 25) - base) << 6; - tmp |= (*(in + 26) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 26) - base) >> (22 - 18); - tmp |= (*(in + 27) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 27) - base) >> (22 - 8); - tmp |= (*(in + 28) - base) << 8; - tmp |= (*(in + 29) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 29) - base) >> (22 - 20); - tmp |= (*(in + 30) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 30) - base) >> (22 - 10); - tmp |= (*(in + 31) - base) << 10; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 88) */ - memcpy(out, &tmp, length); - return 88; -} - -static uint32_t -unpack22_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4194303); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 12)) << (22 - 12); - *(out + 1) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (22 - 2); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 2) & 4194303); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 14)) 
<< (22 - 14); - *(out + 4) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (22 - 4); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 4194303); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 16)) << (22 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 6)) << (22 - 6); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 6) & 4194303); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 18)) << (22 - 18); - *(out + 10) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 8)) << (22 - 8); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 4194303); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 20)) << (22 - 20); - *(out + 13) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 10)) << (22 - 10); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 10) & 4194303); - in32++; - /* consumed: 4 bytes (total: 48) */ - *(out + 16) = base + ((*in32 >> 0) & 4194303); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 12)) << (22 - 12); - *(out + 17) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 2)) << (22 - 2); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 2) & 4194303); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 14)) << (22 - 14); - *(out + 20) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 4)) << (22 - 4); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 4) & 4194303); - tmp = 
(*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 16)) << (22 - 16); - *(out + 23) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 6)) << (22 - 6); - *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 6) & 4194303); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 18)) << (22 - 18); - *(out + 26) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 8)) << (22 - 8); - *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 8) & 4194303); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 20)) << (22 - 20); - *(out + 29) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 10)) << (22 - 10); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 10) & 4194303); - /* remaining: 0 bits */ - return 88; -} - -static uint32_t -pack23_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (23 - 14); - tmp |= (*(in + 2) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (23 - 5); - tmp |= (*(in + 3) - base) << 5; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (23 - 19); - tmp |= (*(in + 5) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (23 - 10); - tmp |= (*(in + 6) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ 
- tmp = (*(in + 6) - base) >> (23 - 1); - tmp |= (*(in + 7) - base) << 1; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (23 - 15); - tmp |= (*(in + 9) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (23 - 6); - tmp |= (*(in + 10) - base) << 6; - tmp |= (*(in + 11) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (23 - 20); - tmp |= (*(in + 12) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) >> (23 - 11); - tmp |= (*(in + 13) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (23 - 2); - tmp |= (*(in + 14) - base) << 2; - tmp |= (*(in + 15) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 15) - base) >> (23 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 16) - base) >> (23 - 7); - tmp |= (*(in + 17) - base) << 7; - tmp |= (*(in + 18) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 18) - base) >> (23 - 21); - tmp |= (*(in + 19) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 19) - base) >> (23 - 12); - tmp |= (*(in + 20) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 20) - base) >> (23 - 3); - tmp |= (*(in + 21) - base) << 3; - tmp |= (*(in + 22) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 
4 bytes (total: 64) */ - tmp = (*(in + 22) - base) >> (23 - 17); - tmp |= (*(in + 23) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 23) - base) >> (23 - 8); - tmp |= (*(in + 24) - base) << 8; - tmp |= (*(in + 25) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 25) - base) >> (23 - 22); - tmp |= (*(in + 26) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 26) - base) >> (23 - 13); - tmp |= (*(in + 27) - base) << 13; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 27) - base) >> (23 - 4); - tmp |= (*(in + 28) - base) << 4; - tmp |= (*(in + 29) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 29) - base) >> (23 - 18); - tmp |= (*(in + 30) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 30) - base) >> (23 - 9); - tmp |= (*(in + 31) - base) << 9; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 92) */ - memcpy(out, &tmp, length); - return 92; -} - -static uint32_t -unpack23_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8388607); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 14)) << (23 - 14); - *(out + 1) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 5)) << (23 - 5); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 5) & 8388607); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 19)) << (23 - 19); - *(out + 4) = base + tmp; - tmp = 
(*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 10)) << (23 - 10); - *(out + 5) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 1)) << (23 - 1); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 1) & 8388607); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 15)) << (23 - 15); - *(out + 8) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 6)) << (23 - 6); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 8388607); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 20)) << (23 - 20); - *(out + 11) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 11)) << (23 - 11); - *(out + 12) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 2)) << (23 - 2); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 8388607); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 16)) << (23 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 7)) << (23 - 7); - *(out + 16) = base + tmp; - *(out + 17) = base + ((*in32 >> 7) & 8388607); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 21)) << (23 - 21); - *(out + 18) = base + tmp; - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 12)) << (23 - 12); - *(out + 19) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 3)) << (23 - 3); - *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 3) & 8388607); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes 
(total: 68) */ - tmp |= (*in32 % (1U << 17)) << (23 - 17); - *(out + 22) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 8)) << (23 - 8); - *(out + 23) = base + tmp; - *(out + 24) = base + ((*in32 >> 8) & 8388607); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 22)) << (23 - 22); - *(out + 25) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 13)) << (23 - 13); - *(out + 26) = base + tmp; - tmp = (*in32 >> 13); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 4)) << (23 - 4); - *(out + 27) = base + tmp; - *(out + 28) = base + ((*in32 >> 4) & 8388607); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 18)) << (23 - 18); - *(out + 29) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 9)) << (23 - 9); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 9) & 8388607); - /* remaining: 0 bits */ - return 92; -} - -static uint32_t -pack24_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (24 - 16); - tmp |= (*(in + 2) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (24 - 8); - tmp |= (*(in + 3) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (24 - 16); - tmp |= (*(in + 6) - base) << 16; - *(uint32_t *)out = tmp; - out += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (24 - 8); - tmp |= (*(in + 7) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (24 - 16); - tmp |= (*(in + 10) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 10) - base) >> (24 - 8); - tmp |= (*(in + 11) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) << 0; - tmp |= (*(in + 13) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (24 - 16); - tmp |= (*(in + 14) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (24 - 8); - tmp |= (*(in + 15) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 17) - base) >> (24 - 16); - tmp |= (*(in + 18) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 18) - base) >> (24 - 8); - tmp |= (*(in + 19) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 20) - base) << 0; - tmp |= (*(in + 21) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 21) - base) >> (24 - 16); - tmp |= (*(in + 22) - base) << 16; - *(uint32_t *)out = tmp; - out += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 22) - base) >> (24 - 8); - tmp |= (*(in + 23) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 24) - base) << 0; - tmp |= (*(in + 25) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 25) - base) >> (24 - 16); - tmp |= (*(in + 26) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 26) - base) >> (24 - 8); - tmp |= (*(in + 27) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 28) - base) << 0; - tmp |= (*(in + 29) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 29) - base) >> (24 - 16); - tmp |= (*(in + 30) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 30) - base) >> (24 - 8); - tmp |= (*(in + 31) - base) << 8; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 96) */ - memcpy(out, &tmp, length); - return 96; -} - -static uint32_t -unpack24_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 1) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 4) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 16)) << 
(24 - 16); - *(out + 5) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 28) */ - *(out + 8) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 9) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 40) */ - *(out + 12) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 13) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 52) */ - *(out + 16) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 17) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 64) */ - *(out + 20) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 21) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 76) 
*/ - *(out + 24) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 25) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 88) */ - *(out + 28) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 29) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 8) & 16777215); - /* remaining: 0 bits */ - return 96; -} - -static uint32_t -pack25_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (25 - 18); - tmp |= (*(in + 2) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (25 - 11); - tmp |= (*(in + 3) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (25 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (25 - 22); - tmp |= (*(in + 6) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (25 - 15); - tmp |= (*(in + 7) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = 
(*(in + 7) - base) >> (25 - 8); - tmp |= (*(in + 8) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (25 - 1); - tmp |= (*(in + 9) - base) << 1; - tmp |= (*(in + 10) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 10) - base) >> (25 - 19); - tmp |= (*(in + 11) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (25 - 12); - tmp |= (*(in + 12) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - base) >> (25 - 5); - tmp |= (*(in + 13) - base) << 5; - tmp |= (*(in + 14) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (25 - 23); - tmp |= (*(in + 15) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 15) - base) >> (25 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 16) - base) >> (25 - 9); - tmp |= (*(in + 17) - base) << 9; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 17) - base) >> (25 - 2); - tmp |= (*(in + 18) - base) << 2; - tmp |= (*(in + 19) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 19) - base) >> (25 - 20); - tmp |= (*(in + 20) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 20) - base) >> (25 - 13); - tmp |= (*(in + 21) - base) << 13; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 21) - base) >> (25 - 6); - tmp |= 
(*(in + 22) - base) << 6; - tmp |= (*(in + 23) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 23) - base) >> (25 - 24); - tmp |= (*(in + 24) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 24) - base) >> (25 - 17); - tmp |= (*(in + 25) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 25) - base) >> (25 - 10); - tmp |= (*(in + 26) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 26) - base) >> (25 - 3); - tmp |= (*(in + 27) - base) << 3; - tmp |= (*(in + 28) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 28) - base) >> (25 - 21); - tmp |= (*(in + 29) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 29) - base) >> (25 - 14); - tmp |= (*(in + 30) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 30) - base) >> (25 - 7); - tmp |= (*(in + 31) - base) << 7; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 100) */ - memcpy(out, &tmp, length); - return 100; -} - -static uint32_t -unpack25_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 33554431); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 18)) << (25 - 18); - *(out + 1) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 11)) << (25 - 11); - *(out + 2) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 
4)) << (25 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 33554431); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 22)) << (25 - 22); - *(out + 5) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 15)) << (25 - 15); - *(out + 6) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 8)) << (25 - 8); - *(out + 7) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 1)) << (25 - 1); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 1) & 33554431); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 19)) << (25 - 19); - *(out + 10) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 12)) << (25 - 12); - *(out + 11) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 5)) << (25 - 5); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 5) & 33554431); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 23)) << (25 - 23); - *(out + 14) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 16)) << (25 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 9)) << (25 - 9); - *(out + 16) = base + tmp; - tmp = (*in32 >> 9); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 2)) << (25 - 2); - *(out + 17) = base + tmp; - *(out + 18) = base + ((*in32 >> 2) & 33554431); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 20)) << (25 - 20); - *(out + 19) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes 
(total: 68) */ - tmp |= (*in32 % (1U << 13)) << (25 - 13); - *(out + 20) = base + tmp; - tmp = (*in32 >> 13); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 6)) << (25 - 6); - *(out + 21) = base + tmp; - *(out + 22) = base + ((*in32 >> 6) & 33554431); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 24)) << (25 - 24); - *(out + 23) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 17)) << (25 - 17); - *(out + 24) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 10)) << (25 - 10); - *(out + 25) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 3)) << (25 - 3); - *(out + 26) = base + tmp; - *(out + 27) = base + ((*in32 >> 3) & 33554431); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 21)) << (25 - 21); - *(out + 28) = base + tmp; - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 14)) << (25 - 14); - *(out + 29) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 100) */ - tmp |= (*in32 % (1U << 7)) << (25 - 7); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 7) & 33554431); - /* remaining: 0 bits */ - return 100; -} - -static uint32_t -pack26_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (26 - 20); - tmp |= (*(in + 2) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (26 - 14); - tmp |= (*(in + 3) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 12) */ - tmp = (*(in + 3) - base) >> (26 - 8); - tmp |= (*(in + 4) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (26 - 2); - tmp |= (*(in + 5) - base) << 2; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (26 - 22); - tmp |= (*(in + 7) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (26 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (26 - 10); - tmp |= (*(in + 9) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (26 - 4); - tmp |= (*(in + 10) - base) << 4; - tmp |= (*(in + 11) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (26 - 24); - tmp |= (*(in + 12) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - base) >> (26 - 18); - tmp |= (*(in + 13) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (26 - 12); - tmp |= (*(in + 14) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (26 - 6); - tmp |= (*(in + 15) - base) << 6; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 17) - base) >> (26 - 20); - tmp |= (*(in + 18) - base) << 20; - 
*(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 18) - base) >> (26 - 14); - tmp |= (*(in + 19) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 19) - base) >> (26 - 8); - tmp |= (*(in + 20) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 20) - base) >> (26 - 2); - tmp |= (*(in + 21) - base) << 2; - tmp |= (*(in + 22) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 22) - base) >> (26 - 22); - tmp |= (*(in + 23) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 23) - base) >> (26 - 16); - tmp |= (*(in + 24) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 24) - base) >> (26 - 10); - tmp |= (*(in + 25) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 25) - base) >> (26 - 4); - tmp |= (*(in + 26) - base) << 4; - tmp |= (*(in + 27) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 27) - base) >> (26 - 24); - tmp |= (*(in + 28) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 28) - base) >> (26 - 18); - tmp |= (*(in + 29) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 29) - base) >> (26 - 12); - tmp |= (*(in + 30) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 30) - base) >> (26 - 6); - tmp |= (*(in + 31) - base) << 6; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 
4 bytes (total: 104) */ - memcpy(out, &tmp, length); - return 104; -} - -static uint32_t -unpack26_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 67108863); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 20)) << (26 - 20); - *(out + 1) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 14)) << (26 - 14); - *(out + 2) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 8)) << (26 - 8); - *(out + 3) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (26 - 2); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 2) & 67108863); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 22)) << (26 - 22); - *(out + 6) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 16)) << (26 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 10)) << (26 - 10); - *(out + 8) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 4)) << (26 - 4); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 67108863); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 24)) << (26 - 24); - *(out + 11) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 18)) << (26 - 18); - *(out + 12) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 12)) << (26 - 12); - *(out + 13) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= 
(*in32 % (1U << 6)) << (26 - 6); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 6) & 67108863); - in32++; - /* consumed: 4 bytes (total: 56) */ - *(out + 16) = base + ((*in32 >> 0) & 67108863); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 20)) << (26 - 20); - *(out + 17) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 14)) << (26 - 14); - *(out + 18) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 8)) << (26 - 8); - *(out + 19) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 2)) << (26 - 2); - *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 2) & 67108863); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 22)) << (26 - 22); - *(out + 22) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 16)) << (26 - 16); - *(out + 23) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 10)) << (26 - 10); - *(out + 24) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 4)) << (26 - 4); - *(out + 25) = base + tmp; - *(out + 26) = base + ((*in32 >> 4) & 67108863); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 24)) << (26 - 24); - *(out + 27) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 18)) << (26 - 18); - *(out + 28) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 100) */ - tmp |= (*in32 % (1U << 12)) << (26 - 12); - *(out + 29) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 104) */ - tmp |= (*in32 % (1U << 6)) << (26 - 6); - *(out + 
30) = base + tmp; - *(out + 31) = base + ((*in32 >> 6) & 67108863); - /* remaining: 0 bits */ - return 104; -} - -static uint32_t -pack27_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (27 - 22); - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (27 - 17); - tmp |= (*(in + 3) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (27 - 12); - tmp |= (*(in + 4) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (27 - 7); - tmp |= (*(in + 5) - base) << 7; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (27 - 2); - tmp |= (*(in + 6) - base) << 2; - tmp |= (*(in + 7) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (27 - 24); - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (27 - 19); - tmp |= (*(in + 9) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (27 - 14); - tmp |= (*(in + 10) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 10) - base) >> (27 - 9); - tmp |= (*(in + 11) - base) << 9; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (27 - 4); - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in 
+ 13) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (27 - 26); - tmp |= (*(in + 14) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (27 - 21); - tmp |= (*(in + 15) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 15) - base) >> (27 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 16) - base) >> (27 - 11); - tmp |= (*(in + 17) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 17) - base) >> (27 - 6); - tmp |= (*(in + 18) - base) << 6; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 18) - base) >> (27 - 1); - tmp |= (*(in + 19) - base) << 1; - tmp |= (*(in + 20) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 20) - base) >> (27 - 23); - tmp |= (*(in + 21) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 21) - base) >> (27 - 18); - tmp |= (*(in + 22) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 22) - base) >> (27 - 13); - tmp |= (*(in + 23) - base) << 13; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 23) - base) >> (27 - 8); - tmp |= (*(in + 24) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 24) - base) >> (27 - 3); - tmp |= (*(in + 25) - base) << 3; - tmp |= (*(in + 26) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - 
/* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 26) - base) >> (27 - 25); - tmp |= (*(in + 27) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 27) - base) >> (27 - 20); - tmp |= (*(in + 28) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 28) - base) >> (27 - 15); - tmp |= (*(in + 29) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 29) - base) >> (27 - 10); - tmp |= (*(in + 30) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 30) - base) >> (27 - 5); - tmp |= (*(in + 31) - base) << 5; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 108) */ - memcpy(out, &tmp, length); - return 108; -} - -static uint32_t -unpack27_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 134217727); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 22)) << (27 - 22); - *(out + 1) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 17)) << (27 - 17); - *(out + 2) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 12)) << (27 - 12); - *(out + 3) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 7)) << (27 - 7); - *(out + 4) = base + tmp; - tmp = (*in32 >> 7); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 2)) << (27 - 2); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 134217727); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 24)) << (27 - 
24); - *(out + 7) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 19)) << (27 - 19); - *(out + 8) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 14)) << (27 - 14); - *(out + 9) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 9)) << (27 - 9); - *(out + 10) = base + tmp; - tmp = (*in32 >> 9); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 4)) << (27 - 4); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 134217727); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 26)) << (27 - 26); - *(out + 13) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 21)) << (27 - 21); - *(out + 14) = base + tmp; - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 16)) << (27 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 11)) << (27 - 11); - *(out + 16) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 6)) << (27 - 6); - *(out + 17) = base + tmp; - tmp = (*in32 >> 6); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 1)) << (27 - 1); - *(out + 18) = base + tmp; - *(out + 19) = base + ((*in32 >> 1) & 134217727); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 23)) << (27 - 23); - *(out + 20) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 18)) << (27 - 18); - *(out + 21) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 13)) << (27 - 13); - *(out + 22) = base + tmp; - tmp = (*in32 >> 
13); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 8)) << (27 - 8); - *(out + 23) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 3)) << (27 - 3); - *(out + 24) = base + tmp; - *(out + 25) = base + ((*in32 >> 3) & 134217727); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 25)) << (27 - 25); - *(out + 26) = base + tmp; - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 20)) << (27 - 20); - *(out + 27) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 100) */ - tmp |= (*in32 % (1U << 15)) << (27 - 15); - *(out + 28) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 104) */ - tmp |= (*in32 % (1U << 10)) << (27 - 10); - *(out + 29) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 108) */ - tmp |= (*in32 % (1U << 5)) << (27 - 5); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 5) & 134217727); - /* remaining: 0 bits */ - return 108; -} - -static uint32_t -pack28_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (28 - 24); - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (28 - 20); - tmp |= (*(in + 3) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (28 - 16); - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (28 - 12); - tmp |= (*(in + 5) - base) << 12; - *(uint32_t *)out = tmp; - out += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (28 - 8); - tmp |= (*(in + 6) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (28 - 4); - tmp |= (*(in + 7) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (28 - 24); - tmp |= (*(in + 10) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 10) - base) >> (28 - 20); - tmp |= (*(in + 11) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (28 - 16); - tmp |= (*(in + 12) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (28 - 12); - tmp |= (*(in + 13) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (28 - 8); - tmp |= (*(in + 14) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (28 - 4); - tmp |= (*(in + 15) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 17) - base) >> (28 - 24); - tmp |= (*(in + 18) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 18) - base) >> (28 - 20); - tmp |= (*(in + 19) - base) << 20; - *(uint32_t *)out = tmp; - out += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 19) - base) >> (28 - 16); - tmp |= (*(in + 20) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 20) - base) >> (28 - 12); - tmp |= (*(in + 21) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 21) - base) >> (28 - 8); - tmp |= (*(in + 22) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 22) - base) >> (28 - 4); - tmp |= (*(in + 23) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 24) - base) << 0; - tmp |= (*(in + 25) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 25) - base) >> (28 - 24); - tmp |= (*(in + 26) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 26) - base) >> (28 - 20); - tmp |= (*(in + 27) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 27) - base) >> (28 - 16); - tmp |= (*(in + 28) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 28) - base) >> (28 - 12); - tmp |= (*(in + 29) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 29) - base) >> (28 - 8); - tmp |= (*(in + 30) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 30) - base) >> (28 - 4); - tmp |= (*(in + 31) - base) << 4; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 112) */ - memcpy(out, &tmp, length); - return 112; -} - -static uint32_t -unpack28_32(uint32_t 
base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 268435455); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 24)) << (28 - 24); - *(out + 1) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 20)) << (28 - 20); - *(out + 2) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 16)) << (28 - 16); - *(out + 3) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (28 - 12); - *(out + 4) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (28 - 8); - *(out + 5) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 4)) << (28 - 4); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 4) & 268435455); - in32++; - /* consumed: 4 bytes (total: 32) */ - *(out + 8) = base + ((*in32 >> 0) & 268435455); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 24)) << (28 - 24); - *(out + 9) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 20)) << (28 - 20); - *(out + 10) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 16)) << (28 - 16); - *(out + 11) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 12)) << (28 - 12); - *(out + 12) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 8)) << (28 - 8); - *(out + 13) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (28 - 4); - *(out + 14) = base + tmp; 
- *(out + 15) = base + ((*in32 >> 4) & 268435455); - in32++; - /* consumed: 4 bytes (total: 60) */ - *(out + 16) = base + ((*in32 >> 0) & 268435455); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 24)) << (28 - 24); - *(out + 17) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 20)) << (28 - 20); - *(out + 18) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 16)) << (28 - 16); - *(out + 19) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 12)) << (28 - 12); - *(out + 20) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 8)) << (28 - 8); - *(out + 21) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 4)) << (28 - 4); - *(out + 22) = base + tmp; - *(out + 23) = base + ((*in32 >> 4) & 268435455); - in32++; - /* consumed: 4 bytes (total: 88) */ - *(out + 24) = base + ((*in32 >> 0) & 268435455); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 24)) << (28 - 24); - *(out + 25) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 20)) << (28 - 20); - *(out + 26) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 100) */ - tmp |= (*in32 % (1U << 16)) << (28 - 16); - *(out + 27) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 104) */ - tmp |= (*in32 % (1U << 12)) << (28 - 12); - *(out + 28) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 108) */ - tmp |= (*in32 % (1U << 8)) << (28 - 8); - *(out + 29) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 112) */ - tmp |= (*in32 % (1U << 4)) << (28 - 4); - *(out + 30) = 
base + tmp; - *(out + 31) = base + ((*in32 >> 4) & 268435455); - /* remaining: 0 bits */ - return 112; -} - -static uint32_t -pack29_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (29 - 26); - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (29 - 23); - tmp |= (*(in + 3) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (29 - 20); - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (29 - 17); - tmp |= (*(in + 5) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (29 - 14); - tmp |= (*(in + 6) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (29 - 11); - tmp |= (*(in + 7) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (29 - 8); - tmp |= (*(in + 8) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (29 - 5); - tmp |= (*(in + 9) - base) << 5; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (29 - 2); - tmp |= (*(in + 10) - base) << 2; - tmp |= (*(in + 11) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (29 - 28); - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out 
= tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (29 - 25); - tmp |= (*(in + 13) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (29 - 22); - tmp |= (*(in + 14) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (29 - 19); - tmp |= (*(in + 15) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 15) - base) >> (29 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 16) - base) >> (29 - 13); - tmp |= (*(in + 17) - base) << 13; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 17) - base) >> (29 - 10); - tmp |= (*(in + 18) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 18) - base) >> (29 - 7); - tmp |= (*(in + 19) - base) << 7; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 19) - base) >> (29 - 4); - tmp |= (*(in + 20) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 20) - base) >> (29 - 1); - tmp |= (*(in + 21) - base) << 1; - tmp |= (*(in + 22) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 22) - base) >> (29 - 27); - tmp |= (*(in + 23) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 23) - base) >> (29 - 24); - tmp |= (*(in + 24) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 24) - base) >> (29 - 
21); - tmp |= (*(in + 25) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 25) - base) >> (29 - 18); - tmp |= (*(in + 26) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 26) - base) >> (29 - 15); - tmp |= (*(in + 27) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 27) - base) >> (29 - 12); - tmp |= (*(in + 28) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 28) - base) >> (29 - 9); - tmp |= (*(in + 29) - base) << 9; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 29) - base) >> (29 - 6); - tmp |= (*(in + 30) - base) << 6; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 112) */ - tmp = (*(in + 30) - base) >> (29 - 3); - tmp |= (*(in + 31) - base) << 3; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 116) */ - memcpy(out, &tmp, length); - return 116; -} - -static uint32_t -unpack29_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 536870911); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 26)) << (29 - 26); - *(out + 1) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 23)) << (29 - 23); - *(out + 2) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 20)) << (29 - 20); - *(out + 3) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 17)) << (29 - 17); - *(out + 4) = base + tmp; - tmp = (*in32 >> 17); - in32++; - 
/* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 14)) << (29 - 14); - *(out + 5) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 11)) << (29 - 11); - *(out + 6) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 8)) << (29 - 8); - *(out + 7) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 5)) << (29 - 5); - *(out + 8) = base + tmp; - tmp = (*in32 >> 5); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 2)) << (29 - 2); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 536870911); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 28)) << (29 - 28); - *(out + 11) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 25)) << (29 - 25); - *(out + 12) = base + tmp; - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 22)) << (29 - 22); - *(out + 13) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 19)) << (29 - 19); - *(out + 14) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 16)) << (29 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 13)) << (29 - 13); - *(out + 16) = base + tmp; - tmp = (*in32 >> 13); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 10)) << (29 - 10); - *(out + 17) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 7)) << (29 - 7); - *(out + 18) = base + tmp; - tmp = (*in32 >> 7); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 4)) << (29 - 4); - *(out + 19) = base + tmp; 
- tmp = (*in32 >> 4); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 1)) << (29 - 1); - *(out + 20) = base + tmp; - *(out + 21) = base + ((*in32 >> 1) & 536870911); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 27)) << (29 - 27); - *(out + 22) = base + tmp; - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 24)) << (29 - 24); - *(out + 23) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 21)) << (29 - 21); - *(out + 24) = base + tmp; - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 18)) << (29 - 18); - *(out + 25) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 100) */ - tmp |= (*in32 % (1U << 15)) << (29 - 15); - *(out + 26) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 104) */ - tmp |= (*in32 % (1U << 12)) << (29 - 12); - *(out + 27) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 108) */ - tmp |= (*in32 % (1U << 9)) << (29 - 9); - *(out + 28) = base + tmp; - tmp = (*in32 >> 9); - in32++; - /* consumed: 4 bytes (total: 112) */ - tmp |= (*in32 % (1U << 6)) << (29 - 6); - *(out + 29) = base + tmp; - tmp = (*in32 >> 6); - in32++; - /* consumed: 4 bytes (total: 116) */ - tmp |= (*in32 % (1U << 3)) << (29 - 3); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 3) & 536870911); - /* remaining: 0 bits */ - return 116; -} - -static uint32_t -pack30_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (30 - 28); - tmp |= (*(in + 2) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - 
tmp = (*(in + 2) - base) >> (30 - 26); - tmp |= (*(in + 3) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (30 - 24); - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (30 - 22); - tmp |= (*(in + 5) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (30 - 20); - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (30 - 18); - tmp |= (*(in + 7) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (30 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (30 - 14); - tmp |= (*(in + 9) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (30 - 12); - tmp |= (*(in + 10) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (30 - 10); - tmp |= (*(in + 11) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 11) - base) >> (30 - 8); - tmp |= (*(in + 12) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (30 - 6); - tmp |= (*(in + 13) - base) << 6; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (30 - 4); - tmp |= (*(in + 14) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 
56) */ - tmp = (*(in + 14) - base) >> (30 - 2); - tmp |= (*(in + 15) - base) << 2; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 16) - base) << 0; - tmp |= (*(in + 17) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 17) - base) >> (30 - 28); - tmp |= (*(in + 18) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 18) - base) >> (30 - 26); - tmp |= (*(in + 19) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 19) - base) >> (30 - 24); - tmp |= (*(in + 20) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 20) - base) >> (30 - 22); - tmp |= (*(in + 21) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 21) - base) >> (30 - 20); - tmp |= (*(in + 22) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 22) - base) >> (30 - 18); - tmp |= (*(in + 23) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 23) - base) >> (30 - 16); - tmp |= (*(in + 24) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 24) - base) >> (30 - 14); - tmp |= (*(in + 25) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 25) - base) >> (30 - 12); - tmp |= (*(in + 26) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 26) - base) >> (30 - 10); - tmp |= (*(in + 27) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 104) */ - tmp = (*(in + 27) - base) >> (30 - 8); - tmp |= (*(in + 28) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 28) - base) >> (30 - 6); - tmp |= (*(in + 29) - base) << 6; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 112) */ - tmp = (*(in + 29) - base) >> (30 - 4); - tmp |= (*(in + 30) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 116) */ - tmp = (*(in + 30) - base) >> (30 - 2); - tmp |= (*(in + 31) - base) << 2; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 120) */ - memcpy(out, &tmp, length); - return 120; -} - -static uint32_t -unpack30_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1073741823); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 28)) << (30 - 28); - *(out + 1) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 26)) << (30 - 26); - *(out + 2) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 24)) << (30 - 24); - *(out + 3) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 22)) << (30 - 22); - *(out + 4) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 20)) << (30 - 20); - *(out + 5) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 18)) << (30 - 18); - *(out + 6) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 16)) << (30 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 
4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 14)) << (30 - 14); - *(out + 8) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 12)) << (30 - 12); - *(out + 9) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 10)) << (30 - 10); - *(out + 10) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 8)) << (30 - 8); - *(out + 11) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 6)) << (30 - 6); - *(out + 12) = base + tmp; - tmp = (*in32 >> 6); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (30 - 4); - *(out + 13) = base + tmp; - tmp = (*in32 >> 4); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 2)) << (30 - 2); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 2) & 1073741823); - in32++; - /* consumed: 4 bytes (total: 64) */ - *(out + 16) = base + ((*in32 >> 0) & 1073741823); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 28)) << (30 - 28); - *(out + 17) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 26)) << (30 - 26); - *(out + 18) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 24)) << (30 - 24); - *(out + 19) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 22)) << (30 - 22); - *(out + 20) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 20)) << (30 - 20); - *(out + 21) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 18)) << (30 - 18); - *(out + 22) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 
bytes (total: 92) */ - tmp |= (*in32 % (1U << 16)) << (30 - 16); - *(out + 23) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 14)) << (30 - 14); - *(out + 24) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 100) */ - tmp |= (*in32 % (1U << 12)) << (30 - 12); - *(out + 25) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 104) */ - tmp |= (*in32 % (1U << 10)) << (30 - 10); - *(out + 26) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 108) */ - tmp |= (*in32 % (1U << 8)) << (30 - 8); - *(out + 27) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 112) */ - tmp |= (*in32 % (1U << 6)) << (30 - 6); - *(out + 28) = base + tmp; - tmp = (*in32 >> 6); - in32++; - /* consumed: 4 bytes (total: 116) */ - tmp |= (*in32 % (1U << 4)) << (30 - 4); - *(out + 29) = base + tmp; - tmp = (*in32 >> 4); - in32++; - /* consumed: 4 bytes (total: 120) */ - tmp |= (*in32 % (1U << 2)) << (30 - 2); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 2) & 1073741823); - /* remaining: 0 bits */ - return 120; -} - -static uint32_t -pack31_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (31 - 30); - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (31 - 29); - tmp |= (*(in + 3) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (31 - 28); - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (31 - 27); - tmp |= 
(*(in + 5) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (31 - 26); - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (31 - 25); - tmp |= (*(in + 7) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (31 - 24); - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (31 - 23); - tmp |= (*(in + 9) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (31 - 22); - tmp |= (*(in + 10) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (31 - 21); - tmp |= (*(in + 11) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 11) - base) >> (31 - 20); - tmp |= (*(in + 12) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (31 - 19); - tmp |= (*(in + 13) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (31 - 18); - tmp |= (*(in + 14) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 14) - base) >> (31 - 17); - tmp |= (*(in + 15) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 15) - base) >> (31 - 16); - tmp |= (*(in + 16) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = (*(in + 16) - base) >> 
(31 - 15); - tmp |= (*(in + 17) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = (*(in + 17) - base) >> (31 - 14); - tmp |= (*(in + 18) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = (*(in + 18) - base) >> (31 - 13); - tmp |= (*(in + 19) - base) << 13; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = (*(in + 19) - base) >> (31 - 12); - tmp |= (*(in + 20) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = (*(in + 20) - base) >> (31 - 11); - tmp |= (*(in + 21) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = (*(in + 21) - base) >> (31 - 10); - tmp |= (*(in + 22) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = (*(in + 22) - base) >> (31 - 9); - tmp |= (*(in + 23) - base) << 9; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = (*(in + 23) - base) >> (31 - 8); - tmp |= (*(in + 24) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = (*(in + 24) - base) >> (31 - 7); - tmp |= (*(in + 25) - base) << 7; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = (*(in + 25) - base) >> (31 - 6); - tmp |= (*(in + 26) - base) << 6; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = (*(in + 26) - base) >> (31 - 5); - tmp |= (*(in + 27) - base) << 5; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = (*(in + 27) - base) >> (31 - 4); - tmp |= (*(in + 28) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 112) */ - tmp = 
(*(in + 28) - base) >> (31 - 3); - tmp |= (*(in + 29) - base) << 3; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 116) */ - tmp = (*(in + 29) - base) >> (31 - 2); - tmp |= (*(in + 30) - base) << 2; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 120) */ - tmp = (*(in + 30) - base) >> (31 - 1); - tmp |= (*(in + 31) - base) << 1; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 124) */ - memcpy(out, &tmp, length); - return 124; -} - -static uint32_t -unpack31_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2147483647); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 30)) << (31 - 30); - *(out + 1) = base + tmp; - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 29)) << (31 - 29); - *(out + 2) = base + tmp; - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 28)) << (31 - 28); - *(out + 3) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 27)) << (31 - 27); - *(out + 4) = base + tmp; - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 26)) << (31 - 26); - *(out + 5) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 25)) << (31 - 25); - *(out + 6) = base + tmp; - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 24)) << (31 - 24); - *(out + 7) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 23)) << (31 - 23); - *(out + 8) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 22)) << (31 - 22); - 
*(out + 9) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 21)) << (31 - 21); - *(out + 10) = base + tmp; - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 20)) << (31 - 20); - *(out + 11) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 19)) << (31 - 19); - *(out + 12) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 18)) << (31 - 18); - *(out + 13) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 17)) << (31 - 17); - *(out + 14) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 16)) << (31 - 16); - *(out + 15) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 68) */ - tmp |= (*in32 % (1U << 15)) << (31 - 15); - *(out + 16) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 72) */ - tmp |= (*in32 % (1U << 14)) << (31 - 14); - *(out + 17) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 76) */ - tmp |= (*in32 % (1U << 13)) << (31 - 13); - *(out + 18) = base + tmp; - tmp = (*in32 >> 13); - in32++; - /* consumed: 4 bytes (total: 80) */ - tmp |= (*in32 % (1U << 12)) << (31 - 12); - *(out + 19) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 84) */ - tmp |= (*in32 % (1U << 11)) << (31 - 11); - *(out + 20) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 88) */ - tmp |= (*in32 % (1U << 10)) << (31 - 10); - *(out + 21) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 92) */ - tmp |= (*in32 % (1U << 9)) << (31 - 9); - *(out + 22) = base + tmp; - tmp = (*in32 >> 9); - in32++; - /* consumed: 4 bytes (total: 96) */ - tmp |= (*in32 % (1U << 8)) << (31 - 8); - 
*(out + 23) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 100) */ - tmp |= (*in32 % (1U << 7)) << (31 - 7); - *(out + 24) = base + tmp; - tmp = (*in32 >> 7); - in32++; - /* consumed: 4 bytes (total: 104) */ - tmp |= (*in32 % (1U << 6)) << (31 - 6); - *(out + 25) = base + tmp; - tmp = (*in32 >> 6); - in32++; - /* consumed: 4 bytes (total: 108) */ - tmp |= (*in32 % (1U << 5)) << (31 - 5); - *(out + 26) = base + tmp; - tmp = (*in32 >> 5); - in32++; - /* consumed: 4 bytes (total: 112) */ - tmp |= (*in32 % (1U << 4)) << (31 - 4); - *(out + 27) = base + tmp; - tmp = (*in32 >> 4); - in32++; - /* consumed: 4 bytes (total: 116) */ - tmp |= (*in32 % (1U << 3)) << (31 - 3); - *(out + 28) = base + tmp; - tmp = (*in32 >> 3); - in32++; - /* consumed: 4 bytes (total: 120) */ - tmp |= (*in32 % (1U << 2)) << (31 - 2); - *(out + 29) = base + tmp; - tmp = (*in32 >> 2); - in32++; - /* consumed: 4 bytes (total: 124) */ - tmp |= (*in32 % (1U << 1)) << (31 - 1); - *(out + 30) = base + tmp; - *(out + 31) = base + ((*in32 >> 1) & 2147483647); - /* remaining: 0 bits */ - return 124; -} - -static uint32_t -pack32_32(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t i; - uint32_t *out32 = (uint32_t *)out; - for (i = 0; i < 32; i++) - out32[i] = in[i] - base; - return 32 * sizeof(uint32_t); -} - -static uint32_t -unpack32_32(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t i; - uint32_t *in32 = (uint32_t *)in; - for (i = 0; i < 32; i++) - out[i] = base + in32[i]; - return 32 * sizeof(uint32_t); -} - -for_packfunc_t for_pack32[33] = { - pack0_n, - pack1_32, - pack2_32, - pack3_32, - pack4_32, - pack5_32, - pack6_32, - pack7_32, - pack8_32, - pack9_32, - pack10_32, - pack11_32, - pack12_32, - pack13_32, - pack14_32, - pack15_32, - pack16_32, - pack17_32, - pack18_32, - pack19_32, - pack20_32, - pack21_32, - pack22_32, - pack23_32, - pack24_32, - pack25_32, - pack26_32, - pack27_32, - pack28_32, - pack29_32, - pack30_32, - pack31_32, - 
pack32_32 -}; - -for_unpackfunc_t for_unpack32[33] = { - unpack0_n, - unpack1_32, - unpack2_32, - unpack3_32, - unpack4_32, - unpack5_32, - unpack6_32, - unpack7_32, - unpack8_32, - unpack9_32, - unpack10_32, - unpack11_32, - unpack12_32, - unpack13_32, - unpack14_32, - unpack15_32, - unpack16_32, - unpack17_32, - unpack18_32, - unpack19_32, - unpack20_32, - unpack21_32, - unpack22_32, - unpack23_32, - unpack24_32, - unpack25_32, - unpack26_32, - unpack27_32, - unpack28_32, - unpack29_32, - unpack30_32, - unpack31_32, - unpack32_32 -}; - -static uint32_t -pack1_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 1; - tmp |= (*(in + 2) - base) << 2; - tmp |= (*(in + 3) - base) << 3; - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 5; - tmp |= (*(in + 6) - base) << 6; - tmp |= (*(in + 7) - base) << 7; - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 9; - tmp |= (*(in + 10) - base) << 10; - tmp |= (*(in + 11) - base) << 11; - tmp |= (*(in + 12) - base) << 12; - tmp |= (*(in + 13) - base) << 13; - tmp |= (*(in + 14) - base) << 14; - tmp |= (*(in + 15) - base) << 15; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 2) */ - memcpy(out, &tmp, length); - return 2; -} - -static uint32_t -unpack1_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1); - *(out + 1) = base + ((*in32 >> 1) & 1); - *(out + 2) = base + ((*in32 >> 2) & 1); - *(out + 3) = base + ((*in32 >> 3) & 1); - *(out + 4) = base + ((*in32 >> 4) & 1); - *(out + 5) = base + ((*in32 >> 5) & 1); - *(out + 6) = base + ((*in32 >> 6) & 1); - *(out + 7) = base + ((*in32 >> 7) & 1); - *(out + 8) = base + ((*in32 >> 8) & 1); - *(out + 9) = base + ((*in32 >> 9) & 1); - *(out + 10) = base + ((*in32 >> 10) & 1); - *(out + 11) = base + ((*in32 >> 11) & 1); - 
*(out + 12) = base + ((*in32 >> 12) & 1); - *(out + 13) = base + ((*in32 >> 13) & 1); - *(out + 14) = base + ((*in32 >> 14) & 1); - *(out + 15) = base + ((*in32 >> 15) & 1); - /* remaining: 16 bits */ - return 2; -} - -static uint32_t -pack2_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 2; - tmp |= (*(in + 2) - base) << 4; - tmp |= (*(in + 3) - base) << 6; - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 10; - tmp |= (*(in + 6) - base) << 12; - tmp |= (*(in + 7) - base) << 14; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 18; - tmp |= (*(in + 10) - base) << 20; - tmp |= (*(in + 11) - base) << 22; - tmp |= (*(in + 12) - base) << 24; - tmp |= (*(in + 13) - base) << 26; - tmp |= (*(in + 14) - base) << 28; - tmp |= (*(in + 15) - base) << 30; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 4) */ - memcpy(out, &tmp, length); - return 4; -} - -static uint32_t -unpack2_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 3); - *(out + 1) = base + ((*in32 >> 2) & 3); - *(out + 2) = base + ((*in32 >> 4) & 3); - *(out + 3) = base + ((*in32 >> 6) & 3); - *(out + 4) = base + ((*in32 >> 8) & 3); - *(out + 5) = base + ((*in32 >> 10) & 3); - *(out + 6) = base + ((*in32 >> 12) & 3); - *(out + 7) = base + ((*in32 >> 14) & 3); - *(out + 8) = base + ((*in32 >> 16) & 3); - *(out + 9) = base + ((*in32 >> 18) & 3); - *(out + 10) = base + ((*in32 >> 20) & 3); - *(out + 11) = base + ((*in32 >> 22) & 3); - *(out + 12) = base + ((*in32 >> 24) & 3); - *(out + 13) = base + ((*in32 >> 26) & 3); - *(out + 14) = base + ((*in32 >> 28) & 3); - *(out + 15) = base + ((*in32 >> 30) & 3); - /* remaining: 0 bits */ - return 4; -} - -static uint32_t -pack3_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, 
length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 3; - tmp |= (*(in + 2) - base) << 6; - tmp |= (*(in + 3) - base) << 9; - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 15; - tmp |= (*(in + 6) - base) << 18; - tmp |= (*(in + 7) - base) << 21; - tmp |= (*(in + 8) - base) << 24; - tmp |= (*(in + 9) - base) << 27; - tmp |= (*(in + 10) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 10) - base) >> (3 - 1); - tmp |= (*(in + 11) - base) << 1; - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in + 13) - base) << 7; - tmp |= (*(in + 14) - base) << 10; - tmp |= (*(in + 15) - base) << 13; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 6) */ - memcpy(out, &tmp, length); - return 6; -} - -static uint32_t -unpack3_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 7); - *(out + 1) = base + ((*in32 >> 3) & 7); - *(out + 2) = base + ((*in32 >> 6) & 7); - *(out + 3) = base + ((*in32 >> 9) & 7); - *(out + 4) = base + ((*in32 >> 12) & 7); - *(out + 5) = base + ((*in32 >> 15) & 7); - *(out + 6) = base + ((*in32 >> 18) & 7); - *(out + 7) = base + ((*in32 >> 21) & 7); - *(out + 8) = base + ((*in32 >> 24) & 7); - *(out + 9) = base + ((*in32 >> 27) & 7); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 1)) << (3 - 1); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 1) & 7); - *(out + 12) = base + ((*in32 >> 4) & 7); - *(out + 13) = base + ((*in32 >> 7) & 7); - *(out + 14) = base + ((*in32 >> 10) & 7); - *(out + 15) = base + ((*in32 >> 13) & 7); - /* remaining: 16 bits */ - return 6; -} - -static uint32_t -pack4_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 4; - tmp |= (*(in + 2) 
- base) << 8; - tmp |= (*(in + 3) - base) << 12; - tmp |= (*(in + 4) - base) << 16; - tmp |= (*(in + 5) - base) << 20; - tmp |= (*(in + 6) - base) << 24; - tmp |= (*(in + 7) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 4; - tmp |= (*(in + 10) - base) << 8; - tmp |= (*(in + 11) - base) << 12; - tmp |= (*(in + 12) - base) << 16; - tmp |= (*(in + 13) - base) << 20; - tmp |= (*(in + 14) - base) << 24; - tmp |= (*(in + 15) - base) << 28; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 8) */ - memcpy(out, &tmp, length); - return 8; -} - -static uint32_t -unpack4_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 15); - *(out + 1) = base + ((*in32 >> 4) & 15); - *(out + 2) = base + ((*in32 >> 8) & 15); - *(out + 3) = base + ((*in32 >> 12) & 15); - *(out + 4) = base + ((*in32 >> 16) & 15); - *(out + 5) = base + ((*in32 >> 20) & 15); - *(out + 6) = base + ((*in32 >> 24) & 15); - *(out + 7) = base + ((*in32 >> 28) & 15); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 8) = base + ((*in32 >> 0) & 15); - *(out + 9) = base + ((*in32 >> 4) & 15); - *(out + 10) = base + ((*in32 >> 8) & 15); - *(out + 11) = base + ((*in32 >> 12) & 15); - *(out + 12) = base + ((*in32 >> 16) & 15); - *(out + 13) = base + ((*in32 >> 20) & 15); - *(out + 14) = base + ((*in32 >> 24) & 15); - *(out + 15) = base + ((*in32 >> 28) & 15); - /* remaining: 0 bits */ - return 8; -} - -static uint32_t -pack5_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 5; - tmp |= (*(in + 2) - base) << 10; - tmp |= (*(in + 3) - base) << 15; - tmp |= (*(in + 4) - base) << 20; - tmp |= (*(in + 5) - base) << 25; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 6) - base) >> (5 - 3); - tmp |= (*(in + 7) - base) << 3; - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 13; - tmp |= (*(in + 10) - base) << 18; - tmp |= (*(in + 11) - base) << 23; - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 12) - base) >> (5 - 1); - tmp |= (*(in + 13) - base) << 1; - tmp |= (*(in + 14) - base) << 6; - tmp |= (*(in + 15) - base) << 11; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 10) */ - memcpy(out, &tmp, length); - return 10; -} - -static uint32_t -unpack5_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 31); - *(out + 1) = base + ((*in32 >> 5) & 31); - *(out + 2) = base + ((*in32 >> 10) & 31); - *(out + 3) = base + ((*in32 >> 15) & 31); - *(out + 4) = base + ((*in32 >> 20) & 31); - *(out + 5) = base + ((*in32 >> 25) & 31); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 3)) << (5 - 3); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 3) & 31); - *(out + 8) = base + ((*in32 >> 8) & 31); - *(out + 9) = base + ((*in32 >> 13) & 31); - *(out + 10) = base + ((*in32 >> 18) & 31); - *(out + 11) = base + ((*in32 >> 23) & 31); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 1)) << (5 - 1); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 1) & 31); - *(out + 14) = base + ((*in32 >> 6) & 31); - *(out + 15) = base + ((*in32 >> 11) & 31); - /* remaining: 16 bits */ - return 10; -} - -static uint32_t -pack6_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 6; - tmp |= (*(in + 2) - base) << 12; - 
tmp |= (*(in + 3) - base) << 18; - tmp |= (*(in + 4) - base) << 24; - tmp |= (*(in + 5) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 5) - base) >> (6 - 4); - tmp |= (*(in + 6) - base) << 4; - tmp |= (*(in + 7) - base) << 10; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 22; - tmp |= (*(in + 10) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 10) - base) >> (6 - 2); - tmp |= (*(in + 11) - base) << 2; - tmp |= (*(in + 12) - base) << 8; - tmp |= (*(in + 13) - base) << 14; - tmp |= (*(in + 14) - base) << 20; - tmp |= (*(in + 15) - base) << 26; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 12) */ - memcpy(out, &tmp, length); - return 12; -} - -static uint32_t -unpack6_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 63); - *(out + 1) = base + ((*in32 >> 6) & 63); - *(out + 2) = base + ((*in32 >> 12) & 63); - *(out + 3) = base + ((*in32 >> 18) & 63); - *(out + 4) = base + ((*in32 >> 24) & 63); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (6 - 4); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 63); - *(out + 7) = base + ((*in32 >> 10) & 63); - *(out + 8) = base + ((*in32 >> 16) & 63); - *(out + 9) = base + ((*in32 >> 22) & 63); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (6 - 2); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 2) & 63); - *(out + 12) = base + ((*in32 >> 8) & 63); - *(out + 13) = base + ((*in32 >> 14) & 63); - *(out + 14) = base + ((*in32 >> 20) & 63); - *(out + 15) = base + ((*in32 >> 26) & 63); - /* remaining: 0 bits */ - return 12; -} - -static uint32_t -pack7_16(uint32_t base, const uint32_t 
*in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 7; - tmp |= (*(in + 2) - base) << 14; - tmp |= (*(in + 3) - base) << 21; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) >> (7 - 3); - tmp |= (*(in + 5) - base) << 3; - tmp |= (*(in + 6) - base) << 10; - tmp |= (*(in + 7) - base) << 17; - tmp |= (*(in + 8) - base) << 24; - tmp |= (*(in + 9) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 9) - base) >> (7 - 6); - tmp |= (*(in + 10) - base) << 6; - tmp |= (*(in + 11) - base) << 13; - tmp |= (*(in + 12) - base) << 20; - tmp |= (*(in + 13) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 13) - base) >> (7 - 2); - tmp |= (*(in + 14) - base) << 2; - tmp |= (*(in + 15) - base) << 9; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 14) */ - memcpy(out, &tmp, length); - return 14; -} - -static uint32_t -unpack7_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 127); - *(out + 1) = base + ((*in32 >> 7) & 127); - *(out + 2) = base + ((*in32 >> 14) & 127); - *(out + 3) = base + ((*in32 >> 21) & 127); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 3)) << (7 - 3); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 3) & 127); - *(out + 6) = base + ((*in32 >> 10) & 127); - *(out + 7) = base + ((*in32 >> 17) & 127); - *(out + 8) = base + ((*in32 >> 24) & 127); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (7 - 6); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 127); - *(out + 11) = base + ((*in32 >> 
13) & 127); - *(out + 12) = base + ((*in32 >> 20) & 127); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 2)) << (7 - 2); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 127); - *(out + 15) = base + ((*in32 >> 9) & 127); - /* remaining: 16 bits */ - return 14; -} - -static uint32_t -pack8_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 8; - tmp |= (*(in + 2) - base) << 16; - tmp |= (*(in + 3) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 8; - tmp |= (*(in + 6) - base) << 16; - tmp |= (*(in + 7) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 8; - tmp |= (*(in + 10) - base) << 16; - tmp |= (*(in + 11) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 12) - base) << 0; - tmp |= (*(in + 13) - base) << 8; - tmp |= (*(in + 14) - base) << 16; - tmp |= (*(in + 15) - base) << 24; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 16) */ - memcpy(out, &tmp, length); - return 16; -} - -static uint32_t -unpack8_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 255); - *(out + 1) = base + ((*in32 >> 8) & 255); - *(out + 2) = base + ((*in32 >> 16) & 255); - *(out + 3) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 4) = base + ((*in32 >> 0) & 255); - *(out + 5) = base + ((*in32 >> 8) & 255); - *(out + 6) = base + ((*in32 >> 16) & 255); - *(out + 7) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 12) 
*/ - *(out + 8) = base + ((*in32 >> 0) & 255); - *(out + 9) = base + ((*in32 >> 8) & 255); - *(out + 10) = base + ((*in32 >> 16) & 255); - *(out + 11) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 12) = base + ((*in32 >> 0) & 255); - *(out + 13) = base + ((*in32 >> 8) & 255); - *(out + 14) = base + ((*in32 >> 16) & 255); - *(out + 15) = base + ((*in32 >> 24) & 255); - /* remaining: 0 bits */ - return 16; -} - -static uint32_t -pack9_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 9; - tmp |= (*(in + 2) - base) << 18; - tmp |= (*(in + 3) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (9 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 13; - tmp |= (*(in + 6) - base) << 22; - tmp |= (*(in + 7) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 7) - base) >> (9 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 17; - tmp |= (*(in + 10) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 10) - base) >> (9 - 3); - tmp |= (*(in + 11) - base) << 3; - tmp |= (*(in + 12) - base) << 12; - tmp |= (*(in + 13) - base) << 21; - tmp |= (*(in + 14) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 14) - base) >> (9 - 7); - tmp |= (*(in + 15) - base) << 7; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 18) */ - memcpy(out, &tmp, length); - return 18; -} - -static uint32_t -unpack9_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 511); - *(out + 1) = base + 
((*in32 >> 9) & 511); - *(out + 2) = base + ((*in32 >> 18) & 511); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (9 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 511); - *(out + 5) = base + ((*in32 >> 13) & 511); - *(out + 6) = base + ((*in32 >> 22) & 511); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (9 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 511); - *(out + 9) = base + ((*in32 >> 17) & 511); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 3)) << (9 - 3); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 3) & 511); - *(out + 12) = base + ((*in32 >> 12) & 511); - *(out + 13) = base + ((*in32 >> 21) & 511); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 7)) << (9 - 7); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 7) & 511); - /* remaining: 16 bits */ - return 18; -} - -static uint32_t -pack10_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 10; - tmp |= (*(in + 2) - base) << 20; - tmp |= (*(in + 3) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (10 - 8); - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 18; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 6) - base) >> (10 - 6); - tmp |= (*(in + 7) - base) << 6; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 9) - base) >> (10 - 4); - tmp |= (*(in + 10) - base) << 4; - tmp |= (*(in + 11) - base) << 14; - tmp |= (*(in 
+ 12) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 12) - base) >> (10 - 2); - tmp |= (*(in + 13) - base) << 2; - tmp |= (*(in + 14) - base) << 12; - tmp |= (*(in + 15) - base) << 22; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 20) */ - memcpy(out, &tmp, length); - return 20; -} - -static uint32_t -unpack10_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1023); - *(out + 1) = base + ((*in32 >> 10) & 1023); - *(out + 2) = base + ((*in32 >> 20) & 1023); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 8)) << (10 - 8); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 1023); - *(out + 5) = base + ((*in32 >> 18) & 1023); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (10 - 6); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 6) & 1023); - *(out + 8) = base + ((*in32 >> 16) & 1023); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (10 - 4); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 1023); - *(out + 11) = base + ((*in32 >> 14) & 1023); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (10 - 2); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 2) & 1023); - *(out + 14) = base + ((*in32 >> 12) & 1023); - *(out + 15) = base + ((*in32 >> 22) & 1023); - /* remaining: 0 bits */ - return 20; -} - -static uint32_t -pack11_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 11; - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) 
*/ - tmp = (*(in + 2) - base) >> (11 - 1); - tmp |= (*(in + 3) - base) << 1; - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (11 - 2); - tmp |= (*(in + 6) - base) << 2; - tmp |= (*(in + 7) - base) << 13; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) >> (11 - 3); - tmp |= (*(in + 9) - base) << 3; - tmp |= (*(in + 10) - base) << 14; - tmp |= (*(in + 11) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 11) - base) >> (11 - 4); - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in + 13) - base) << 15; - tmp |= (*(in + 14) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 14) - base) >> (11 - 5); - tmp |= (*(in + 15) - base) << 5; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 22) */ - memcpy(out, &tmp, length); - return 22; -} - -static uint32_t -unpack11_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2047); - *(out + 1) = base + ((*in32 >> 11) & 2047); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 1)) << (11 - 1); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 1) & 2047); - *(out + 4) = base + ((*in32 >> 12) & 2047); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (11 - 2); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 2047); - *(out + 7) = base + ((*in32 >> 13) & 2047); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 3)) << (11 - 3); - *(out + 8) = 
base + tmp; - *(out + 9) = base + ((*in32 >> 3) & 2047); - *(out + 10) = base + ((*in32 >> 14) & 2047); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (11 - 4); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 2047); - *(out + 13) = base + ((*in32 >> 15) & 2047); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 5)) << (11 - 5); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 5) & 2047); - /* remaining: 16 bits */ - return 22; -} - -static uint32_t -pack12_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 12; - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (12 - 4); - tmp |= (*(in + 3) - base) << 4; - tmp |= (*(in + 4) - base) << 16; - tmp |= (*(in + 5) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (12 - 8); - tmp |= (*(in + 6) - base) << 8; - tmp |= (*(in + 7) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 12; - tmp |= (*(in + 10) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 10) - base) >> (12 - 4); - tmp |= (*(in + 11) - base) << 4; - tmp |= (*(in + 12) - base) << 16; - tmp |= (*(in + 13) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 13) - base) >> (12 - 8); - tmp |= (*(in + 14) - base) << 8; - tmp |= (*(in + 15) - base) << 20; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 24) */ - memcpy(out, &tmp, length); - return 24; -} - 
-static uint32_t -unpack12_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4095); - *(out + 1) = base + ((*in32 >> 12) & 4095); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (12 - 4); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 4) & 4095); - *(out + 4) = base + ((*in32 >> 16) & 4095); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (12 - 8); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 8) & 4095); - *(out + 7) = base + ((*in32 >> 20) & 4095); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 8) = base + ((*in32 >> 0) & 4095); - *(out + 9) = base + ((*in32 >> 12) & 4095); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (12 - 4); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 4) & 4095); - *(out + 12) = base + ((*in32 >> 16) & 4095); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (12 - 8); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 8) & 4095); - *(out + 15) = base + ((*in32 >> 20) & 4095); - /* remaining: 0 bits */ - return 24; -} - -static uint32_t -pack13_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 13; - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (13 - 7); - tmp |= (*(in + 3) - base) << 7; - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (13 - 1); - tmp |= (*(in + 5) - base) << 1; - tmp |= (*(in + 6) - base) << 14; - tmp |= (*(in + 7) - base) << 27; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 7) - base) >> (13 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (13 - 2); - tmp |= (*(in + 10) - base) << 2; - tmp |= (*(in + 11) - base) << 15; - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 12) - base) >> (13 - 9); - tmp |= (*(in + 13) - base) << 9; - tmp |= (*(in + 14) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 14) - base) >> (13 - 3); - tmp |= (*(in + 15) - base) << 3; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 26) */ - memcpy(out, &tmp, length); - return 26; -} - -static uint32_t -unpack13_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8191); - *(out + 1) = base + ((*in32 >> 13) & 8191); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 7)) << (13 - 7); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 7) & 8191); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 1)) << (13 - 1); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 1) & 8191); - *(out + 6) = base + ((*in32 >> 14) & 8191); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 8)) << (13 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 8191); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (13 - 2); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 8191); - *(out + 11) = base + ((*in32 >> 
15) & 8191); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 9)) << (13 - 9); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 9) & 8191); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 3)) << (13 - 3); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 3) & 8191); - /* remaining: 16 bits */ - return 26; -} - -static uint32_t -pack14_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 14; - tmp |= (*(in + 2) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (14 - 10); - tmp |= (*(in + 3) - base) << 10; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (14 - 6); - tmp |= (*(in + 5) - base) << 6; - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (14 - 2); - tmp |= (*(in + 7) - base) << 2; - tmp |= (*(in + 8) - base) << 16; - tmp |= (*(in + 9) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 9) - base) >> (14 - 12); - tmp |= (*(in + 10) - base) << 12; - tmp |= (*(in + 11) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 11) - base) >> (14 - 8); - tmp |= (*(in + 12) - base) << 8; - tmp |= (*(in + 13) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 13) - base) >> (14 - 4); - tmp |= (*(in + 14) - base) << 4; - tmp |= (*(in + 15) - base) << 18; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 28) */ - memcpy(out, 
&tmp, length); - return 28; -} - -static uint32_t -unpack14_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16383); - *(out + 1) = base + ((*in32 >> 14) & 16383); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 10)) << (14 - 10); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 10) & 16383); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (14 - 6); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 6) & 16383); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 2)) << (14 - 2); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 2) & 16383); - *(out + 8) = base + ((*in32 >> 16) & 16383); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (14 - 12); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 12) & 16383); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (14 - 8); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 16383); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 4)) << (14 - 4); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 4) & 16383); - *(out + 15) = base + ((*in32 >> 18) & 16383); - /* remaining: 0 bits */ - return 28; -} - -static uint32_t -pack15_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 15; - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (15 - 13); - tmp |= (*(in + 3) - base) << 13; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (15 - 11); - tmp |= (*(in + 5) - base) << 11; - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (15 - 9); - tmp |= (*(in + 7) - base) << 9; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) >> (15 - 7); - tmp |= (*(in + 9) - base) << 7; - tmp |= (*(in + 10) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) >> (15 - 5); - tmp |= (*(in + 11) - base) << 5; - tmp |= (*(in + 12) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) >> (15 - 3); - tmp |= (*(in + 13) - base) << 3; - tmp |= (*(in + 14) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) >> (15 - 1); - tmp |= (*(in + 15) - base) << 1; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 30) */ - memcpy(out, &tmp, length); - return 30; -} - -static uint32_t -unpack15_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 32767); - *(out + 1) = base + ((*in32 >> 15) & 32767); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 13)) << (15 - 13); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 13) & 32767); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 11)) << (15 - 11); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 11) & 32767); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 9)) << (15 - 9); - *(out 
+ 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 9) & 32767); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 7)) << (15 - 7); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 7) & 32767); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 5)) << (15 - 5); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 5) & 32767); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 3)) << (15 - 3); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 3) & 32767); - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 1)) << (15 - 1); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 1) & 32767); - /* remaining: 16 bits */ - return 30; -} - -static uint32_t -pack16_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) << 0; - tmp |= (*(in + 3) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) << 0; - tmp |= (*(in + 7) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 10) - base) << 0; - tmp |= (*(in + 11) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 12) - base) << 0; - tmp |= (*(in + 13) - base) << 16; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 14) - base) << 0; - tmp |= (*(in + 15) - base) << 16; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 32) */ - memcpy(out, &tmp, length); - return 32; -} - -static uint32_t -unpack16_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 65535); - *(out + 1) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 2) = base + ((*in32 >> 0) & 65535); - *(out + 3) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 12) */ - *(out + 4) = base + ((*in32 >> 0) & 65535); - *(out + 5) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 6) = base + ((*in32 >> 0) & 65535); - *(out + 7) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 20) */ - *(out + 8) = base + ((*in32 >> 0) & 65535); - *(out + 9) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 24) */ - *(out + 10) = base + ((*in32 >> 0) & 65535); - *(out + 11) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 28) */ - *(out + 12) = base + ((*in32 >> 0) & 65535); - *(out + 13) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 32) */ - *(out + 14) = base + ((*in32 >> 0) & 65535); - *(out + 15) = base + ((*in32 >> 16) & 65535); - /* remaining: 0 bits */ - return 32; -} - -static uint32_t -pack17_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (17 - 2); - tmp |= (*(in + 2) - base) << 2; - tmp |= (*(in + 3) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (17 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (17 - 6); - tmp |= (*(in + 6) - base) << 6; - tmp |= (*(in + 7) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (17 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 9) - base) >> (17 - 10); - tmp |= (*(in + 10) - base) << 10; - tmp |= (*(in + 11) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 11) - base) >> (17 - 12); - tmp |= (*(in + 12) - base) << 12; - tmp |= (*(in + 13) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 13) - base) >> (17 - 14); - tmp |= (*(in + 14) - base) << 14; - tmp |= (*(in + 15) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 15) - base) >> (17 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 34) */ - memcpy(out, &tmp, length); - return 34; -} - -static uint32_t -unpack17_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 131071); - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 2)) << (17 - 2); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 2) & 131071); - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 4)) << (17 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 131071); 
- tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 6)) << (17 - 6); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 6) & 131071); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 8)) << (17 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 131071); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 10)) << (17 - 10); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 10) & 131071); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 12)) << (17 - 12); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 12) & 131071); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 14)) << (17 - 14); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 14) & 131071); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 16)) << (17 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 34; -} - -static uint32_t -pack18_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (18 - 4); - tmp |= (*(in + 2) - base) << 4; - tmp |= (*(in + 3) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (18 - 8); - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (18 - 12); - tmp |= (*(in + 6) - base) << 12; - tmp |= (*(in + 7) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ 
- tmp = (*(in + 7) - base) >> (18 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (18 - 2); - tmp |= (*(in + 9) - base) << 2; - tmp |= (*(in + 10) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (18 - 6); - tmp |= (*(in + 11) - base) << 6; - tmp |= (*(in + 12) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 12) - base) >> (18 - 10); - tmp |= (*(in + 13) - base) << 10; - tmp |= (*(in + 14) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 14) - base) >> (18 - 14); - tmp |= (*(in + 15) - base) << 14; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 36) */ - memcpy(out, &tmp, length); - return 36; -} - -static uint32_t -unpack18_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 262143); - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (18 - 4); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 4) & 262143); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (18 - 8); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 262143); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 12)) << (18 - 12); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 12) & 262143); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 16)) << (18 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 2)) << 
(18 - 2); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 2) & 262143); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 6)) << (18 - 6); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 6) & 262143); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 10)) << (18 - 10); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 10) & 262143); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 14)) << (18 - 14); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 14) & 262143); - /* remaining: 0 bits */ - return 36; -} - -static uint32_t -pack19_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (19 - 6); - tmp |= (*(in + 2) - base) << 6; - tmp |= (*(in + 3) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (19 - 12); - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (19 - 18); - tmp |= (*(in + 6) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (19 - 5); - tmp |= (*(in + 7) - base) << 5; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) >> (19 - 11); - tmp |= (*(in + 9) - base) << 11; - tmp |= (*(in + 10) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 10) - base) >> (19 - 17); - tmp |= (*(in + 11) - 
base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (19 - 4); - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in + 13) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 13) - base) >> (19 - 10); - tmp |= (*(in + 14) - base) << 10; - tmp |= (*(in + 15) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 15) - base) >> (19 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 38) */ - memcpy(out, &tmp, length); - return 38; -} - -static uint32_t -unpack19_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 524287); - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 6)) << (19 - 6); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 6) & 524287); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 12)) << (19 - 12); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 12) & 524287); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 18)) << (19 - 18); - *(out + 5) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 5)) << (19 - 5); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 5) & 524287); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 11)) << (19 - 11); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 11) & 524287); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 17)) << (19 - 17); - *(out + 10) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 32) 
*/ - tmp |= (*in32 % (1U << 4)) << (19 - 4); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 524287); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 10)) << (19 - 10); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 10) & 524287); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 16)) << (19 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 38; -} - -static uint32_t -pack20_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (20 - 8); - tmp |= (*(in + 2) - base) << 8; - tmp |= (*(in + 3) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (20 - 16); - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (20 - 4); - tmp |= (*(in + 5) - base) << 4; - tmp |= (*(in + 6) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (20 - 12); - tmp |= (*(in + 7) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (20 - 8); - tmp |= (*(in + 10) - base) << 8; - tmp |= (*(in + 11) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 11) - base) >> (20 - 16); - tmp |= (*(in + 12) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (20 - 4); - tmp |= (*(in + 13) - base) << 4; - tmp |= (*(in + 14) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 14) - base) >> (20 - 12); - tmp |= (*(in + 15) - base) << 12; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 40) */ - memcpy(out, &tmp, length); - return 40; -} - -static uint32_t -unpack20_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1048575); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 8)) << (20 - 8); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 8) & 1048575); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 16)) << (20 - 16); - *(out + 3) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (20 - 4); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 4) & 1048575); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (20 - 12); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 12) & 1048575); - in32++; - /* consumed: 4 bytes (total: 24) */ - *(out + 8) = base + ((*in32 >> 0) & 1048575); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 8)) << (20 - 8); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 8) & 1048575); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 16)) << (20 - 16); - *(out + 11) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 4)) << (20 - 4); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 4) & 1048575); - tmp = (*in32 >> 
24); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 12)) << (20 - 12); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 12) & 1048575); - /* remaining: 0 bits */ - return 40; -} - -static uint32_t -pack21_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (21 - 10); - tmp |= (*(in + 2) - base) << 10; - tmp |= (*(in + 3) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (21 - 20); - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (21 - 9); - tmp |= (*(in + 5) - base) << 9; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (21 - 19); - tmp |= (*(in + 7) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (21 - 8); - tmp |= (*(in + 8) - base) << 8; - tmp |= (*(in + 9) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 9) - base) >> (21 - 18); - tmp |= (*(in + 10) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (21 - 7); - tmp |= (*(in + 11) - base) << 7; - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 12) - base) >> (21 - 17); - tmp |= (*(in + 13) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 13) - base) >> (21 - 6); - 
tmp |= (*(in + 14) - base) << 6; - tmp |= (*(in + 15) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 15) - base) >> (21 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 42) */ - memcpy(out, &tmp, length); - return 42; -} - -static uint32_t -unpack21_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2097151); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 10)) << (21 - 10); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 10) & 2097151); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 20)) << (21 - 20); - *(out + 3) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 9)) << (21 - 9); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 9) & 2097151); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 19)) << (21 - 19); - *(out + 6) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (21 - 8); - *(out + 7) = base + tmp; - *(out + 8) = base + ((*in32 >> 8) & 2097151); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 18)) << (21 - 18); - *(out + 9) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 7)) << (21 - 7); - *(out + 10) = base + tmp; - *(out + 11) = base + ((*in32 >> 7) & 2097151); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 17)) << (21 - 17); - *(out + 12) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 6)) << (21 - 6); - *(out + 13) = 
base + tmp; - *(out + 14) = base + ((*in32 >> 6) & 2097151); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 16)) << (21 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 42; -} - -static uint32_t -pack22_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (22 - 12); - tmp |= (*(in + 2) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (22 - 2); - tmp |= (*(in + 3) - base) << 2; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (22 - 14); - tmp |= (*(in + 5) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (22 - 4); - tmp |= (*(in + 6) - base) << 4; - tmp |= (*(in + 7) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (22 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (22 - 6); - tmp |= (*(in + 9) - base) << 6; - tmp |= (*(in + 10) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 10) - base) >> (22 - 18); - tmp |= (*(in + 11) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (22 - 8); - tmp |= (*(in + 12) - base) << 8; - tmp |= (*(in + 13) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 
13) - base) >> (22 - 20); - tmp |= (*(in + 14) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 14) - base) >> (22 - 10); - tmp |= (*(in + 15) - base) << 10; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 44) */ - memcpy(out, &tmp, length); - return 44; -} - -static uint32_t -unpack22_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4194303); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 12)) << (22 - 12); - *(out + 1) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (22 - 2); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 2) & 4194303); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 14)) << (22 - 14); - *(out + 4) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (22 - 4); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 4194303); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 16)) << (22 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 6)) << (22 - 6); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 6) & 4194303); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 18)) << (22 - 18); - *(out + 10) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 8)) << (22 - 8); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 8) & 4194303); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 20)) << (22 
- 20); - *(out + 13) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 10)) << (22 - 10); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 10) & 4194303); - /* remaining: 0 bits */ - return 44; -} - -static uint32_t -pack23_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (23 - 14); - tmp |= (*(in + 2) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (23 - 5); - tmp |= (*(in + 3) - base) << 5; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (23 - 19); - tmp |= (*(in + 5) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (23 - 10); - tmp |= (*(in + 6) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (23 - 1); - tmp |= (*(in + 7) - base) << 1; - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) >> (23 - 15); - tmp |= (*(in + 9) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (23 - 6); - tmp |= (*(in + 10) - base) << 6; - tmp |= (*(in + 11) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 11) - base) >> (23 - 20); - tmp |= (*(in + 12) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) 
>> (23 - 11); - tmp |= (*(in + 13) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (23 - 2); - tmp |= (*(in + 14) - base) << 2; - tmp |= (*(in + 15) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 15) - base) >> (23 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 46) */ - memcpy(out, &tmp, length); - return 46; -} - -static uint32_t -unpack23_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8388607); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 14)) << (23 - 14); - *(out + 1) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 5)) << (23 - 5); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 5) & 8388607); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 19)) << (23 - 19); - *(out + 4) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 10)) << (23 - 10); - *(out + 5) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 1)) << (23 - 1); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 1) & 8388607); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 15)) << (23 - 15); - *(out + 8) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 6)) << (23 - 6); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 6) & 8388607); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 20)) << (23 - 20); - *(out + 11) = base + 
tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 11)) << (23 - 11); - *(out + 12) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 2)) << (23 - 2); - *(out + 13) = base + tmp; - *(out + 14) = base + ((*in32 >> 2) & 8388607); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 16)) << (23 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 46; -} - -static uint32_t -pack24_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (24 - 16); - tmp |= (*(in + 2) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (24 - 8); - tmp |= (*(in + 3) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (24 - 16); - tmp |= (*(in + 6) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (24 - 8); - tmp |= (*(in + 7) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 9) - base) >> (24 - 16); - tmp |= (*(in + 10) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 10) - base) >> (24 - 8); - tmp |= (*(in + 11) - 
base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 12) - base) << 0; - tmp |= (*(in + 13) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 13) - base) >> (24 - 16); - tmp |= (*(in + 14) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (24 - 8); - tmp |= (*(in + 15) - base) << 8; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 48) */ - memcpy(out, &tmp, length); - return 48; -} - -static uint32_t -unpack24_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 1) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 4) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 5) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 28) */ - *(out + 8) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 9) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 10) = base + tmp; - *(out + 
11) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 40) */ - *(out + 12) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 13) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 8) & 16777215); - /* remaining: 0 bits */ - return 48; -} - -static uint32_t -pack25_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (25 - 18); - tmp |= (*(in + 2) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (25 - 11); - tmp |= (*(in + 3) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (25 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (25 - 22); - tmp |= (*(in + 6) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (25 - 15); - tmp |= (*(in + 7) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (25 - 8); - tmp |= (*(in + 8) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (25 - 1); - tmp |= (*(in + 9) - base) << 1; - tmp |= (*(in + 10) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 32) */ - tmp = (*(in + 10) - base) >> (25 - 19); - tmp |= (*(in + 11) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (25 - 12); - tmp |= (*(in + 12) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - base) >> (25 - 5); - tmp |= (*(in + 13) - base) << 5; - tmp |= (*(in + 14) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 14) - base) >> (25 - 23); - tmp |= (*(in + 15) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 15) - base) >> (25 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 50) */ - memcpy(out, &tmp, length); - return 50; -} - -static uint32_t -unpack25_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 33554431); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 18)) << (25 - 18); - *(out + 1) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 11)) << (25 - 11); - *(out + 2) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (25 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 33554431); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 22)) << (25 - 22); - *(out + 5) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 15)) << (25 - 15); - *(out + 6) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 8)) << (25 - 8); - *(out + 7) = base + 
tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 1)) << (25 - 1); - *(out + 8) = base + tmp; - *(out + 9) = base + ((*in32 >> 1) & 33554431); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 19)) << (25 - 19); - *(out + 10) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 12)) << (25 - 12); - *(out + 11) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 5)) << (25 - 5); - *(out + 12) = base + tmp; - *(out + 13) = base + ((*in32 >> 5) & 33554431); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 23)) << (25 - 23); - *(out + 14) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 16)) << (25 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 50; -} - -static uint32_t -pack26_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (26 - 20); - tmp |= (*(in + 2) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (26 - 14); - tmp |= (*(in + 3) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (26 - 8); - tmp |= (*(in + 4) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (26 - 2); - tmp |= (*(in + 5) - base) << 2; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (26 - 
22); - tmp |= (*(in + 7) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (26 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (26 - 10); - tmp |= (*(in + 9) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (26 - 4); - tmp |= (*(in + 10) - base) << 4; - tmp |= (*(in + 11) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 11) - base) >> (26 - 24); - tmp |= (*(in + 12) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 12) - base) >> (26 - 18); - tmp |= (*(in + 13) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (26 - 12); - tmp |= (*(in + 14) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (26 - 6); - tmp |= (*(in + 15) - base) << 6; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 52) */ - memcpy(out, &tmp, length); - return 52; -} - -static uint32_t -unpack26_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 67108863); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 20)) << (26 - 20); - *(out + 1) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 14)) << (26 - 14); - *(out + 2) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 8)) << (26 - 8); - *(out + 3) 
= base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (26 - 2); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 2) & 67108863); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 22)) << (26 - 22); - *(out + 6) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 16)) << (26 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 10)) << (26 - 10); - *(out + 8) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 4)) << (26 - 4); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 4) & 67108863); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 24)) << (26 - 24); - *(out + 11) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 18)) << (26 - 18); - *(out + 12) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 12)) << (26 - 12); - *(out + 13) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 6)) << (26 - 6); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 6) & 67108863); - /* remaining: 0 bits */ - return 52; -} - -static uint32_t -pack27_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (27 - 22); - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (27 - 17); - tmp |= (*(in + 3) - base) << 17; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (27 - 12); - tmp |= (*(in + 4) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (27 - 7); - tmp |= (*(in + 5) - base) << 7; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (27 - 2); - tmp |= (*(in + 6) - base) << 2; - tmp |= (*(in + 7) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (27 - 24); - tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) >> (27 - 19); - tmp |= (*(in + 9) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (27 - 14); - tmp |= (*(in + 10) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 10) - base) >> (27 - 9); - tmp |= (*(in + 11) - base) << 9; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (27 - 4); - tmp |= (*(in + 12) - base) << 4; - tmp |= (*(in + 13) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 13) - base) >> (27 - 26); - tmp |= (*(in + 14) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 14) - base) >> (27 - 21); - tmp |= (*(in + 15) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 15) - base) >> (27 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 54) */ - memcpy(out, &tmp, length); - 
return 54; -} - -static uint32_t -unpack27_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 134217727); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 22)) << (27 - 22); - *(out + 1) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 17)) << (27 - 17); - *(out + 2) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 12)) << (27 - 12); - *(out + 3) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 7)) << (27 - 7); - *(out + 4) = base + tmp; - tmp = (*in32 >> 7); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 2)) << (27 - 2); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 134217727); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 24)) << (27 - 24); - *(out + 7) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 19)) << (27 - 19); - *(out + 8) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 14)) << (27 - 14); - *(out + 9) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 9)) << (27 - 9); - *(out + 10) = base + tmp; - tmp = (*in32 >> 9); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 4)) << (27 - 4); - *(out + 11) = base + tmp; - *(out + 12) = base + ((*in32 >> 4) & 134217727); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 26)) << (27 - 26); - *(out + 13) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 21)) << (27 - 21); - *(out + 14) = base 
+ tmp; - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 16)) << (27 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 54; -} - -static uint32_t -pack28_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (28 - 24); - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (28 - 20); - tmp |= (*(in + 3) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (28 - 16); - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (28 - 12); - tmp |= (*(in + 5) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (28 - 8); - tmp |= (*(in + 6) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (28 - 4); - tmp |= (*(in + 7) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 8) - base) << 0; - tmp |= (*(in + 9) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 9) - base) >> (28 - 24); - tmp |= (*(in + 10) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 10) - base) >> (28 - 20); - tmp |= (*(in + 11) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (28 - 16); - tmp |= 
(*(in + 12) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (28 - 12); - tmp |= (*(in + 13) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (28 - 8); - tmp |= (*(in + 14) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (28 - 4); - tmp |= (*(in + 15) - base) << 4; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 56) */ - memcpy(out, &tmp, length); - return 56; -} - -static uint32_t -unpack28_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 268435455); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 24)) << (28 - 24); - *(out + 1) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 20)) << (28 - 20); - *(out + 2) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 16)) << (28 - 16); - *(out + 3) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (28 - 12); - *(out + 4) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (28 - 8); - *(out + 5) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 4)) << (28 - 4); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 4) & 268435455); - in32++; - /* consumed: 4 bytes (total: 32) */ - *(out + 8) = base + ((*in32 >> 0) & 268435455); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 24)) << (28 - 24); - *(out + 9) = base 
+ tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 20)) << (28 - 20); - *(out + 10) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 16)) << (28 - 16); - *(out + 11) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 12)) << (28 - 12); - *(out + 12) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 8)) << (28 - 8); - *(out + 13) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (28 - 4); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 4) & 268435455); - /* remaining: 0 bits */ - return 56; -} - -static uint32_t -pack29_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (29 - 26); - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (29 - 23); - tmp |= (*(in + 3) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (29 - 20); - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (29 - 17); - tmp |= (*(in + 5) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (29 - 14); - tmp |= (*(in + 6) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (29 - 11); - tmp |= (*(in + 7) - base) << 11; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (29 - 8); - tmp |= (*(in + 8) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (29 - 5); - tmp |= (*(in + 9) - base) << 5; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (29 - 2); - tmp |= (*(in + 10) - base) << 2; - tmp |= (*(in + 11) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 11) - base) >> (29 - 28); - tmp |= (*(in + 12) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 12) - base) >> (29 - 25); - tmp |= (*(in + 13) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 13) - base) >> (29 - 22); - tmp |= (*(in + 14) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 14) - base) >> (29 - 19); - tmp |= (*(in + 15) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 15) - base) >> (29 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 58) */ - memcpy(out, &tmp, length); - return 58; -} - -static uint32_t -unpack29_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 536870911); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 26)) << (29 - 26); - *(out + 1) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 23)) << (29 - 23); - *(out + 2) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* 
consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 20)) << (29 - 20); - *(out + 3) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 17)) << (29 - 17); - *(out + 4) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 14)) << (29 - 14); - *(out + 5) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 11)) << (29 - 11); - *(out + 6) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 8)) << (29 - 8); - *(out + 7) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 5)) << (29 - 5); - *(out + 8) = base + tmp; - tmp = (*in32 >> 5); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 2)) << (29 - 2); - *(out + 9) = base + tmp; - *(out + 10) = base + ((*in32 >> 2) & 536870911); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 28)) << (29 - 28); - *(out + 11) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 25)) << (29 - 25); - *(out + 12) = base + tmp; - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 22)) << (29 - 22); - *(out + 13) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 19)) << (29 - 19); - *(out + 14) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 16)) << (29 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 58; -} - -static uint32_t -pack30_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 
bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (30 - 28); - tmp |= (*(in + 2) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (30 - 26); - tmp |= (*(in + 3) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (30 - 24); - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (30 - 22); - tmp |= (*(in + 5) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (30 - 20); - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (30 - 18); - tmp |= (*(in + 7) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (30 - 16); - tmp |= (*(in + 8) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (30 - 14); - tmp |= (*(in + 9) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (30 - 12); - tmp |= (*(in + 10) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (30 - 10); - tmp |= (*(in + 11) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 11) - base) >> (30 - 8); - tmp |= (*(in + 12) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (30 - 6); - tmp |= (*(in + 13) - base) << 6; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (30 - 4); - tmp |= (*(in + 14) - base) << 4; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 14) - base) >> (30 - 2); - tmp |= (*(in + 15) - base) << 2; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 60) */ - memcpy(out, &tmp, length); - return 60; -} - -static uint32_t -unpack30_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1073741823); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 28)) << (30 - 28); - *(out + 1) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 26)) << (30 - 26); - *(out + 2) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 24)) << (30 - 24); - *(out + 3) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 22)) << (30 - 22); - *(out + 4) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 20)) << (30 - 20); - *(out + 5) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 18)) << (30 - 18); - *(out + 6) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 16)) << (30 - 16); - *(out + 7) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 14)) << (30 - 14); - *(out + 8) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 12)) << (30 - 12); - *(out + 9) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 10)) << 
(30 - 10); - *(out + 10) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 8)) << (30 - 8); - *(out + 11) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 6)) << (30 - 6); - *(out + 12) = base + tmp; - tmp = (*in32 >> 6); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 4)) << (30 - 4); - *(out + 13) = base + tmp; - tmp = (*in32 >> 4); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 2)) << (30 - 2); - *(out + 14) = base + tmp; - *(out + 15) = base + ((*in32 >> 2) & 1073741823); - /* remaining: 0 bits */ - return 60; -} - -static uint32_t -pack31_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (31 - 30); - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (31 - 29); - tmp |= (*(in + 3) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (31 - 28); - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (31 - 27); - tmp |= (*(in + 5) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (31 - 26); - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (31 - 25); - tmp |= (*(in + 7) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (31 - 24); 
- tmp |= (*(in + 8) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = (*(in + 8) - base) >> (31 - 23); - tmp |= (*(in + 9) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = (*(in + 9) - base) >> (31 - 22); - tmp |= (*(in + 10) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = (*(in + 10) - base) >> (31 - 21); - tmp |= (*(in + 11) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = (*(in + 11) - base) >> (31 - 20); - tmp |= (*(in + 12) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = (*(in + 12) - base) >> (31 - 19); - tmp |= (*(in + 13) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = (*(in + 13) - base) >> (31 - 18); - tmp |= (*(in + 14) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = (*(in + 14) - base) >> (31 - 17); - tmp |= (*(in + 15) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = (*(in + 15) - base) >> (31 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 62) */ - memcpy(out, &tmp, length); - return 62; -} - -static uint32_t -unpack31_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2147483647); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 30)) << (31 - 30); - *(out + 1) = base + tmp; - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 29)) << (31 - 29); - *(out + 2) = base + tmp; - tmp = (*in32 >> 29); - 
in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 28)) << (31 - 28); - *(out + 3) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 27)) << (31 - 27); - *(out + 4) = base + tmp; - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 26)) << (31 - 26); - *(out + 5) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 25)) << (31 - 25); - *(out + 6) = base + tmp; - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 24)) << (31 - 24); - *(out + 7) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 36) */ - tmp |= (*in32 % (1U << 23)) << (31 - 23); - *(out + 8) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 40) */ - tmp |= (*in32 % (1U << 22)) << (31 - 22); - *(out + 9) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 44) */ - tmp |= (*in32 % (1U << 21)) << (31 - 21); - *(out + 10) = base + tmp; - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 48) */ - tmp |= (*in32 % (1U << 20)) << (31 - 20); - *(out + 11) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 52) */ - tmp |= (*in32 % (1U << 19)) << (31 - 19); - *(out + 12) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 56) */ - tmp |= (*in32 % (1U << 18)) << (31 - 18); - *(out + 13) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 60) */ - tmp |= (*in32 % (1U << 17)) << (31 - 17); - *(out + 14) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 64) */ - tmp |= (*in32 % (1U << 16)) << (31 - 16); - *(out + 15) = base + tmp; - /* remaining: 16 bits */ - return 62; -} - -static uint32_t -pack32_16(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t i; - uint32_t *out32 = (uint32_t 
*)out; - for (i = 0; i < 16; i++) - out32[i] = in[i] - base; - return 16 * sizeof(uint32_t); -} - -static uint32_t -unpack32_16(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t i; - uint32_t *in32 = (uint32_t *)in; - for (i = 0; i < 16; i++) - out[i] = base + in32[i]; - return 16 * sizeof(uint32_t); -} - -for_packfunc_t for_pack16[33] = { - pack0_n, - pack1_16, - pack2_16, - pack3_16, - pack4_16, - pack5_16, - pack6_16, - pack7_16, - pack8_16, - pack9_16, - pack10_16, - pack11_16, - pack12_16, - pack13_16, - pack14_16, - pack15_16, - pack16_16, - pack17_16, - pack18_16, - pack19_16, - pack20_16, - pack21_16, - pack22_16, - pack23_16, - pack24_16, - pack25_16, - pack26_16, - pack27_16, - pack28_16, - pack29_16, - pack30_16, - pack31_16, - pack32_16 -}; - -for_unpackfunc_t for_unpack16[33] = { - unpack0_n, - unpack1_16, - unpack2_16, - unpack3_16, - unpack4_16, - unpack5_16, - unpack6_16, - unpack7_16, - unpack8_16, - unpack9_16, - unpack10_16, - unpack11_16, - unpack12_16, - unpack13_16, - unpack14_16, - unpack15_16, - unpack16_16, - unpack17_16, - unpack18_16, - unpack19_16, - unpack20_16, - unpack21_16, - unpack22_16, - unpack23_16, - unpack24_16, - unpack25_16, - unpack26_16, - unpack27_16, - unpack28_16, - unpack29_16, - unpack30_16, - unpack31_16, - unpack32_16 -}; - -static uint32_t -pack1_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 1; - tmp |= (*(in + 2) - base) << 2; - tmp |= (*(in + 3) - base) << 3; - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 5; - tmp |= (*(in + 6) - base) << 6; - tmp |= (*(in + 7) - base) << 7; - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 1) */ - memcpy(out, &tmp, length); - return 1; -} - -static uint32_t -unpack1_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) 
& 1); - *(out + 1) = base + ((*in32 >> 1) & 1); - *(out + 2) = base + ((*in32 >> 2) & 1); - *(out + 3) = base + ((*in32 >> 3) & 1); - *(out + 4) = base + ((*in32 >> 4) & 1); - *(out + 5) = base + ((*in32 >> 5) & 1); - *(out + 6) = base + ((*in32 >> 6) & 1); - *(out + 7) = base + ((*in32 >> 7) & 1); - /* remaining: 24 bits */ - return 1; -} - -static uint32_t -pack2_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 2; - tmp |= (*(in + 2) - base) << 4; - tmp |= (*(in + 3) - base) << 6; - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 10; - tmp |= (*(in + 6) - base) << 12; - tmp |= (*(in + 7) - base) << 14; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 2) */ - memcpy(out, &tmp, length); - return 2; -} - -static uint32_t -unpack2_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 3); - *(out + 1) = base + ((*in32 >> 2) & 3); - *(out + 2) = base + ((*in32 >> 4) & 3); - *(out + 3) = base + ((*in32 >> 6) & 3); - *(out + 4) = base + ((*in32 >> 8) & 3); - *(out + 5) = base + ((*in32 >> 10) & 3); - *(out + 6) = base + ((*in32 >> 12) & 3); - *(out + 7) = base + ((*in32 >> 14) & 3); - /* remaining: 16 bits */ - return 2; -} - -static uint32_t -pack3_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 3; - tmp |= (*(in + 2) - base) << 6; - tmp |= (*(in + 3) - base) << 9; - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 15; - tmp |= (*(in + 6) - base) << 18; - tmp |= (*(in + 7) - base) << 21; - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 3) */ - memcpy(out, &tmp, length); - return 3; -} - -static uint32_t -unpack3_8(uint32_t base, const uint8_t *in, uint32_t *out) { - 
uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 7); - *(out + 1) = base + ((*in32 >> 3) & 7); - *(out + 2) = base + ((*in32 >> 6) & 7); - *(out + 3) = base + ((*in32 >> 9) & 7); - *(out + 4) = base + ((*in32 >> 12) & 7); - *(out + 5) = base + ((*in32 >> 15) & 7); - *(out + 6) = base + ((*in32 >> 18) & 7); - *(out + 7) = base + ((*in32 >> 21) & 7); - /* remaining: 8 bits */ - return 3; -} - -static uint32_t -pack4_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 4; - tmp |= (*(in + 2) - base) << 8; - tmp |= (*(in + 3) - base) << 12; - tmp |= (*(in + 4) - base) << 16; - tmp |= (*(in + 5) - base) << 20; - tmp |= (*(in + 6) - base) << 24; - tmp |= (*(in + 7) - base) << 28; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 4) */ - memcpy(out, &tmp, length); - return 4; -} - -static uint32_t -unpack4_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 15); - *(out + 1) = base + ((*in32 >> 4) & 15); - *(out + 2) = base + ((*in32 >> 8) & 15); - *(out + 3) = base + ((*in32 >> 12) & 15); - *(out + 4) = base + ((*in32 >> 16) & 15); - *(out + 5) = base + ((*in32 >> 20) & 15); - *(out + 6) = base + ((*in32 >> 24) & 15); - *(out + 7) = base + ((*in32 >> 28) & 15); - /* remaining: 0 bits */ - return 4; -} - -static uint32_t -pack5_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 5; - tmp |= (*(in + 2) - base) << 10; - tmp |= (*(in + 3) - base) << 15; - tmp |= (*(in + 4) - base) << 20; - tmp |= (*(in + 5) - base) << 25; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 6) - base) >> (5 - 3); - tmp |= (*(in + 7) - base) 
<< 3; - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 5) */ - memcpy(out, &tmp, length); - return 5; -} - -static uint32_t -unpack5_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 31); - *(out + 1) = base + ((*in32 >> 5) & 31); - *(out + 2) = base + ((*in32 >> 10) & 31); - *(out + 3) = base + ((*in32 >> 15) & 31); - *(out + 4) = base + ((*in32 >> 20) & 31); - *(out + 5) = base + ((*in32 >> 25) & 31); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 3)) << (5 - 3); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 3) & 31); - /* remaining: 24 bits */ - return 5; -} - -static uint32_t -pack6_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 6; - tmp |= (*(in + 2) - base) << 12; - tmp |= (*(in + 3) - base) << 18; - tmp |= (*(in + 4) - base) << 24; - tmp |= (*(in + 5) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 5) - base) >> (6 - 4); - tmp |= (*(in + 6) - base) << 4; - tmp |= (*(in + 7) - base) << 10; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 6) */ - memcpy(out, &tmp, length); - return 6; -} - -static uint32_t -unpack6_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 63); - *(out + 1) = base + ((*in32 >> 6) & 63); - *(out + 2) = base + ((*in32 >> 12) & 63); - *(out + 3) = base + ((*in32 >> 18) & 63); - *(out + 4) = base + ((*in32 >> 24) & 63); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (6 - 4); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 63); - *(out + 7) = base + 
((*in32 >> 10) & 63); - /* remaining: 16 bits */ - return 6; -} - -static uint32_t -pack7_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 7; - tmp |= (*(in + 2) - base) << 14; - tmp |= (*(in + 3) - base) << 21; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) >> (7 - 3); - tmp |= (*(in + 5) - base) << 3; - tmp |= (*(in + 6) - base) << 10; - tmp |= (*(in + 7) - base) << 17; - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 7) */ - memcpy(out, &tmp, length); - return 7; -} - -static uint32_t -unpack7_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 127); - *(out + 1) = base + ((*in32 >> 7) & 127); - *(out + 2) = base + ((*in32 >> 14) & 127); - *(out + 3) = base + ((*in32 >> 21) & 127); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 3)) << (7 - 3); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 3) & 127); - *(out + 6) = base + ((*in32 >> 10) & 127); - *(out + 7) = base + ((*in32 >> 17) & 127); - /* remaining: 8 bits */ - return 7; -} - -static uint32_t -pack8_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 8; - tmp |= (*(in + 2) - base) << 16; - tmp |= (*(in + 3) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 8; - tmp |= (*(in + 6) - base) << 16; - tmp |= (*(in + 7) - base) << 24; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 8) */ - memcpy(out, &tmp, length); - return 8; -} - -static uint32_t -unpack8_8(uint32_t 
base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 255); - *(out + 1) = base + ((*in32 >> 8) & 255); - *(out + 2) = base + ((*in32 >> 16) & 255); - *(out + 3) = base + ((*in32 >> 24) & 255); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 4) = base + ((*in32 >> 0) & 255); - *(out + 5) = base + ((*in32 >> 8) & 255); - *(out + 6) = base + ((*in32 >> 16) & 255); - *(out + 7) = base + ((*in32 >> 24) & 255); - /* remaining: 0 bits */ - return 8; -} - -static uint32_t -pack9_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 9; - tmp |= (*(in + 2) - base) << 18; - tmp |= (*(in + 3) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (9 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 13; - tmp |= (*(in + 6) - base) << 22; - tmp |= (*(in + 7) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 7) - base) >> (9 - 8); - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 9) */ - memcpy(out, &tmp, length); - return 9; -} - -static uint32_t -unpack9_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 511); - *(out + 1) = base + ((*in32 >> 9) & 511); - *(out + 2) = base + ((*in32 >> 18) & 511); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (9 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 511); - *(out + 5) = base + ((*in32 >> 13) & 511); - *(out + 6) = base + ((*in32 >> 22) & 511); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (9 - 8); - *(out + 
7) = base + tmp; - /* remaining: 24 bits */ - return 9; -} - -static uint32_t -pack10_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 10; - tmp |= (*(in + 2) - base) << 20; - tmp |= (*(in + 3) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 3) - base) >> (10 - 8); - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 18; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 6) - base) >> (10 - 6); - tmp |= (*(in + 7) - base) << 6; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 10) */ - memcpy(out, &tmp, length); - return 10; -} - -static uint32_t -unpack10_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1023); - *(out + 1) = base + ((*in32 >> 10) & 1023); - *(out + 2) = base + ((*in32 >> 20) & 1023); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 8)) << (10 - 8); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 1023); - *(out + 5) = base + ((*in32 >> 18) & 1023); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (10 - 6); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 6) & 1023); - /* remaining: 16 bits */ - return 10; -} - -static uint32_t -pack11_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 11; - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (11 - 1); - tmp |= (*(in + 3) - base) << 1; - tmp |= (*(in + 4) - base) << 12; - tmp |= 
(*(in + 5) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (11 - 2); - tmp |= (*(in + 6) - base) << 2; - tmp |= (*(in + 7) - base) << 13; - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 11) */ - memcpy(out, &tmp, length); - return 11; -} - -static uint32_t -unpack11_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2047); - *(out + 1) = base + ((*in32 >> 11) & 2047); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 1)) << (11 - 1); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 1) & 2047); - *(out + 4) = base + ((*in32 >> 12) & 2047); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (11 - 2); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 2047); - *(out + 7) = base + ((*in32 >> 13) & 2047); - /* remaining: 8 bits */ - return 11; -} - -static uint32_t -pack12_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 12; - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (12 - 4); - tmp |= (*(in + 3) - base) << 4; - tmp |= (*(in + 4) - base) << 16; - tmp |= (*(in + 5) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 5) - base) >> (12 - 8); - tmp |= (*(in + 6) - base) << 8; - tmp |= (*(in + 7) - base) << 20; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 12) */ - memcpy(out, &tmp, length); - return 12; -} - -static uint32_t -unpack12_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = 
(uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4095); - *(out + 1) = base + ((*in32 >> 12) & 4095); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (12 - 4); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 4) & 4095); - *(out + 4) = base + ((*in32 >> 16) & 4095); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (12 - 8); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 8) & 4095); - *(out + 7) = base + ((*in32 >> 20) & 4095); - /* remaining: 0 bits */ - return 12; -} - -static uint32_t -pack13_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 13; - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (13 - 7); - tmp |= (*(in + 3) - base) << 7; - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (13 - 1); - tmp |= (*(in + 5) - base) << 1; - tmp |= (*(in + 6) - base) << 14; - tmp |= (*(in + 7) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 7) - base) >> (13 - 8); - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 13) */ - memcpy(out, &tmp, length); - return 13; -} - -static uint32_t -unpack13_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8191); - *(out + 1) = base + ((*in32 >> 13) & 8191); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 7)) << (13 - 7); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 7) & 8191); - tmp = (*in32 >> 20); - 
in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 1)) << (13 - 1); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 1) & 8191); - *(out + 6) = base + ((*in32 >> 14) & 8191); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 8)) << (13 - 8); - *(out + 7) = base + tmp; - /* remaining: 24 bits */ - return 13; -} - -static uint32_t -pack14_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 14; - tmp |= (*(in + 2) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (14 - 10); - tmp |= (*(in + 3) - base) << 10; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (14 - 6); - tmp |= (*(in + 5) - base) << 6; - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (14 - 2); - tmp |= (*(in + 7) - base) << 2; - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 14) */ - memcpy(out, &tmp, length); - return 14; -} - -static uint32_t -unpack14_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16383); - *(out + 1) = base + ((*in32 >> 14) & 16383); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 10)) << (14 - 10); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 10) & 16383); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 6)) << (14 - 6); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 6) & 16383); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= 
(*in32 % (1U << 2)) << (14 - 2); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 2) & 16383); - /* remaining: 16 bits */ - return 14; -} - -static uint32_t -pack15_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 15; - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) >> (15 - 13); - tmp |= (*(in + 3) - base) << 13; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) >> (15 - 11); - tmp |= (*(in + 5) - base) << 11; - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) >> (15 - 9); - tmp |= (*(in + 7) - base) << 9; - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 15) */ - memcpy(out, &tmp, length); - return 15; -} - -static uint32_t -unpack15_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 32767); - *(out + 1) = base + ((*in32 >> 15) & 32767); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 13)) << (15 - 13); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 13) & 32767); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 11)) << (15 - 11); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 11) & 32767); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 9)) << (15 - 9); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 9) & 32767); - /* remaining: 8 bits */ - return 15; -} - -static uint32_t -pack16_8(uint32_t base, const uint32_t *in, uint8_t *out) { - 
uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 2) - base) << 0; - tmp |= (*(in + 3) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 6) - base) << 0; - tmp |= (*(in + 7) - base) << 16; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 16) */ - memcpy(out, &tmp, length); - return 16; -} - -static uint32_t -unpack16_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 65535); - *(out + 1) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 8) */ - *(out + 2) = base + ((*in32 >> 0) & 65535); - *(out + 3) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 12) */ - *(out + 4) = base + ((*in32 >> 0) & 65535); - *(out + 5) = base + ((*in32 >> 16) & 65535); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 6) = base + ((*in32 >> 0) & 65535); - *(out + 7) = base + ((*in32 >> 16) & 65535); - /* remaining: 0 bits */ - return 16; -} - -static uint32_t -pack17_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (17 - 2); - tmp |= (*(in + 2) - base) << 2; - tmp |= (*(in + 3) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (17 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 21; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (17 - 6); - tmp |= (*(in + 6) - base) << 6; - tmp |= (*(in + 7) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (17 - 8); - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 17) */ - memcpy(out, &tmp, length); - return 17; -} - -static uint32_t -unpack17_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 131071); - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 2)) << (17 - 2); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 2) & 131071); - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 4)) << (17 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 131071); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 6)) << (17 - 6); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 6) & 131071); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 8)) << (17 - 8); - *(out + 7) = base + tmp; - /* remaining: 24 bits */ - return 17; -} - -static uint32_t -pack18_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (18 - 4); - tmp |= (*(in + 2) - base) << 4; - tmp |= (*(in + 3) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (18 - 8); - tmp |= (*(in + 4) - base) << 8; - tmp |= (*(in + 5) - base) << 26; - *(uint32_t *)out = tmp; - out 
+= sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (18 - 12); - tmp |= (*(in + 6) - base) << 12; - tmp |= (*(in + 7) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 7) - base) >> (18 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 18) */ - memcpy(out, &tmp, length); - return 18; -} - -static uint32_t -unpack18_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 262143); - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 4)) << (18 - 4); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 4) & 262143); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (18 - 8); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 8) & 262143); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 12)) << (18 - 12); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 12) & 262143); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 16)) << (18 - 16); - *(out + 7) = base + tmp; - /* remaining: 16 bits */ - return 18; -} - -static uint32_t -pack19_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (19 - 6); - tmp |= (*(in + 2) - base) << 6; - tmp |= (*(in + 3) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (19 - 12); - tmp |= (*(in + 4) - base) << 12; - tmp |= (*(in + 5) - base) << 31; - *(uint32_t *)out = tmp; - out += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 5) - base) >> (19 - 18); - tmp |= (*(in + 6) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (19 - 5); - tmp |= (*(in + 7) - base) << 5; - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 19) */ - memcpy(out, &tmp, length); - return 19; -} - -static uint32_t -unpack19_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 524287); - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 6)) << (19 - 6); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 6) & 524287); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 12)) << (19 - 12); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 12) & 524287); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 18)) << (19 - 18); - *(out + 5) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 5)) << (19 - 5); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 5) & 524287); - /* remaining: 8 bits */ - return 19; -} - -static uint32_t -pack20_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (20 - 8); - tmp |= (*(in + 2) - base) << 8; - tmp |= (*(in + 3) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (20 - 16); - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 12) */ - tmp = (*(in + 4) - base) >> (20 - 4); - tmp |= (*(in + 5) - base) << 4; - tmp |= (*(in + 6) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (20 - 12); - tmp |= (*(in + 7) - base) << 12; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 20) */ - memcpy(out, &tmp, length); - return 20; -} - -static uint32_t -unpack20_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1048575); - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 8)) << (20 - 8); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 8) & 1048575); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 16)) << (20 - 16); - *(out + 3) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (20 - 4); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 4) & 1048575); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (20 - 12); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 12) & 1048575); - /* remaining: 0 bits */ - return 20; -} - -static uint32_t -pack21_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (21 - 10); - tmp |= (*(in + 2) - base) << 10; - tmp |= (*(in + 3) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 3) - base) >> (21 - 20); - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 12) */ - tmp = (*(in + 4) - base) >> (21 - 9); - tmp |= (*(in + 5) - base) << 9; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 6) - base) >> (21 - 19); - tmp |= (*(in + 7) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (21 - 8); - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 21) */ - memcpy(out, &tmp, length); - return 21; -} - -static uint32_t -unpack21_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2097151); - tmp = (*in32 >> 21); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 10)) << (21 - 10); - *(out + 1) = base + tmp; - *(out + 2) = base + ((*in32 >> 10) & 2097151); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 20)) << (21 - 20); - *(out + 3) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 9)) << (21 - 9); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 9) & 2097151); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 19)) << (21 - 19); - *(out + 6) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (21 - 8); - *(out + 7) = base + tmp; - /* remaining: 24 bits */ - return 21; -} - -static uint32_t -pack22_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (22 - 12); - tmp |= (*(in + 2) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - 
/* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (22 - 2); - tmp |= (*(in + 3) - base) << 2; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (22 - 14); - tmp |= (*(in + 5) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (22 - 4); - tmp |= (*(in + 6) - base) << 4; - tmp |= (*(in + 7) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 7) - base) >> (22 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 22) */ - memcpy(out, &tmp, length); - return 22; -} - -static uint32_t -unpack22_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 4194303); - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 12)) << (22 - 12); - *(out + 1) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 2)) << (22 - 2); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 2) & 4194303); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 14)) << (22 - 14); - *(out + 4) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 4)) << (22 - 4); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 4) & 4194303); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 16)) << (22 - 16); - *(out + 7) = base + tmp; - /* remaining: 16 bits */ - return 22; -} - -static uint32_t -pack23_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 23; - 
*(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (23 - 14); - tmp |= (*(in + 2) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (23 - 5); - tmp |= (*(in + 3) - base) << 5; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) >> (23 - 19); - tmp |= (*(in + 5) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (23 - 10); - tmp |= (*(in + 6) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (23 - 1); - tmp |= (*(in + 7) - base) << 1; - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 23) */ - memcpy(out, &tmp, length); - return 23; -} - -static uint32_t -unpack23_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 8388607); - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 14)) << (23 - 14); - *(out + 1) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 5)) << (23 - 5); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 5) & 8388607); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 19)) << (23 - 19); - *(out + 4) = base + tmp; - tmp = (*in32 >> 19); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 10)) << (23 - 10); - *(out + 5) = base + tmp; - tmp = (*in32 >> 10); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 1)) << (23 - 1); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 1) & 
8388607); - /* remaining: 8 bits */ - return 23; -} - -static uint32_t -pack24_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (24 - 16); - tmp |= (*(in + 2) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (24 - 8); - tmp |= (*(in + 3) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 4) - base) << 0; - tmp |= (*(in + 5) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (24 - 16); - tmp |= (*(in + 6) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (24 - 8); - tmp |= (*(in + 7) - base) << 8; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 24) */ - memcpy(out, &tmp, length); - return 24; -} - -static uint32_t -unpack24_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 1) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 2) = base + tmp; - *(out + 3) = base + ((*in32 >> 8) & 16777215); - in32++; - /* consumed: 4 bytes (total: 16) */ - *(out + 4) = base + ((*in32 >> 0) & 16777215); - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 16)) << (24 - 16); - *(out + 5) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* 
consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (24 - 8); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 8) & 16777215); - /* remaining: 0 bits */ - return 24; -} - -static uint32_t -pack25_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (25 - 18); - tmp |= (*(in + 2) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (25 - 11); - tmp |= (*(in + 3) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (25 - 4); - tmp |= (*(in + 4) - base) << 4; - tmp |= (*(in + 5) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 5) - base) >> (25 - 22); - tmp |= (*(in + 6) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (25 - 15); - tmp |= (*(in + 7) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (25 - 8); - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 25) */ - memcpy(out, &tmp, length); - return 25; -} - -static uint32_t -unpack25_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 33554431); - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 18)) << (25 - 18); - *(out + 1) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 11)) << (25 - 11); - *(out + 2) = base + tmp; - tmp = (*in32 
>> 11); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 4)) << (25 - 4); - *(out + 3) = base + tmp; - *(out + 4) = base + ((*in32 >> 4) & 33554431); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 22)) << (25 - 22); - *(out + 5) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 15)) << (25 - 15); - *(out + 6) = base + tmp; - tmp = (*in32 >> 15); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 8)) << (25 - 8); - *(out + 7) = base + tmp; - /* remaining: 24 bits */ - return 25; -} - -static uint32_t -pack26_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (26 - 20); - tmp |= (*(in + 2) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (26 - 14); - tmp |= (*(in + 3) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (26 - 8); - tmp |= (*(in + 4) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (26 - 2); - tmp |= (*(in + 5) - base) << 2; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 6) - base) >> (26 - 22); - tmp |= (*(in + 7) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (26 - 16); - /* remaining: 16 bits */ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 26) */ - memcpy(out, &tmp, length); - return 26; -} - -static uint32_t -unpack26_8(uint32_t 
base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 67108863); - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 20)) << (26 - 20); - *(out + 1) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 14)) << (26 - 14); - *(out + 2) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 8)) << (26 - 8); - *(out + 3) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 2)) << (26 - 2); - *(out + 4) = base + tmp; - *(out + 5) = base + ((*in32 >> 2) & 67108863); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 22)) << (26 - 22); - *(out + 6) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 16)) << (26 - 16); - *(out + 7) = base + tmp; - /* remaining: 16 bits */ - return 26; -} - -static uint32_t -pack27_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (27 - 22); - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (27 - 17); - tmp |= (*(in + 3) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (27 - 12); - tmp |= (*(in + 4) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (27 - 7); - tmp |= (*(in + 5) - base) << 7; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 
bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (27 - 2); - tmp |= (*(in + 6) - base) << 2; - tmp |= (*(in + 7) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 7) - base) >> (27 - 24); - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 27) */ - memcpy(out, &tmp, length); - return 27; -} - -static uint32_t -unpack27_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 134217727); - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 22)) << (27 - 22); - *(out + 1) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 17)) << (27 - 17); - *(out + 2) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 12)) << (27 - 12); - *(out + 3) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 7)) << (27 - 7); - *(out + 4) = base + tmp; - tmp = (*in32 >> 7); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 2)) << (27 - 2); - *(out + 5) = base + tmp; - *(out + 6) = base + ((*in32 >> 2) & 134217727); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 24)) << (27 - 24); - *(out + 7) = base + tmp; - /* remaining: 8 bits */ - return 27; -} - -static uint32_t -pack28_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (28 - 24); - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> 
(28 - 20); - tmp |= (*(in + 3) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (28 - 16); - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (28 - 12); - tmp |= (*(in + 5) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (28 - 8); - tmp |= (*(in + 6) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (28 - 4); - tmp |= (*(in + 7) - base) << 4; - /* remaining: 0 bits */ - length = (32 / 8) - (32 - 32) / 8; - /* consumed: 4 bytes (total: 28) */ - memcpy(out, &tmp, length); - return 28; -} - -static uint32_t -unpack28_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 268435455); - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 24)) << (28 - 24); - *(out + 1) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 20)) << (28 - 20); - *(out + 2) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 16)) << (28 - 16); - *(out + 3) = base + tmp; - tmp = (*in32 >> 16); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 12)) << (28 - 12); - *(out + 4) = base + tmp; - tmp = (*in32 >> 12); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 8)) << (28 - 8); - *(out + 5) = base + tmp; - tmp = (*in32 >> 8); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 4)) << (28 - 4); - *(out + 6) = base + tmp; - *(out + 7) = base + ((*in32 >> 4) & 268435455); - /* remaining: 0 bits */ - return 28; -} - -static 
uint32_t -pack29_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (29 - 26); - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (29 - 23); - tmp |= (*(in + 3) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (29 - 20); - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (29 - 17); - tmp |= (*(in + 5) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (29 - 14); - tmp |= (*(in + 6) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (29 - 11); - tmp |= (*(in + 7) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (29 - 8); - /* remaining: 24 bits */ - length = (32 / 8) - (32 - 8) / 8; - /* consumed: 1 bytes (total: 29) */ - memcpy(out, &tmp, length); - return 29; -} - -static uint32_t -unpack29_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 536870911); - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 26)) << (29 - 26); - *(out + 1) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 23)) << (29 - 23); - *(out + 2) = base + tmp; - tmp = (*in32 >> 23); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= 
(*in32 % (1U << 20)) << (29 - 20); - *(out + 3) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 17)) << (29 - 17); - *(out + 4) = base + tmp; - tmp = (*in32 >> 17); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 14)) << (29 - 14); - *(out + 5) = base + tmp; - tmp = (*in32 >> 14); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 11)) << (29 - 11); - *(out + 6) = base + tmp; - tmp = (*in32 >> 11); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 8)) << (29 - 8); - *(out + 7) = base + tmp; - /* remaining: 24 bits */ - return 29; -} - -static uint32_t -pack30_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (30 - 28); - tmp |= (*(in + 2) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (30 - 26); - tmp |= (*(in + 3) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (30 - 24); - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (30 - 22); - tmp |= (*(in + 5) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (30 - 20); - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (30 - 18); - tmp |= (*(in + 7) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (30 - 16); - /* remaining: 16 bits 
*/ - length = (32 / 8) - (32 - 16) / 8; - /* consumed: 2 bytes (total: 30) */ - memcpy(out, &tmp, length); - return 30; -} - -static uint32_t -unpack30_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 1073741823); - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 28)) << (30 - 28); - *(out + 1) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 26)) << (30 - 26); - *(out + 2) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 24)) << (30 - 24); - *(out + 3) = base + tmp; - tmp = (*in32 >> 24); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 22)) << (30 - 22); - *(out + 4) = base + tmp; - tmp = (*in32 >> 22); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 20)) << (30 - 20); - *(out + 5) = base + tmp; - tmp = (*in32 >> 20); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 18)) << (30 - 18); - *(out + 6) = base + tmp; - tmp = (*in32 >> 18); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 16)) << (30 - 16); - *(out + 7) = base + tmp; - /* remaining: 16 bits */ - return 30; -} - -static uint32_t -pack31_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t tmp, length; - tmp = (*(in + 0) - base) << 0; - tmp |= (*(in + 1) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 4) */ - tmp = (*(in + 1) - base) >> (31 - 30); - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = (*(in + 2) - base) >> (31 - 29); - tmp |= (*(in + 3) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = (*(in + 3) - base) >> (31 - 28); - tmp |= 
(*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = (*(in + 4) - base) >> (31 - 27); - tmp |= (*(in + 5) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = (*(in + 5) - base) >> (31 - 26); - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = (*(in + 6) - base) >> (31 - 25); - tmp |= (*(in + 7) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = (*(in + 7) - base) >> (31 - 24); - /* remaining: 8 bits */ - length = (32 / 8) - (32 - 24) / 8; - /* consumed: 3 bytes (total: 31) */ - memcpy(out, &tmp, length); - return 31; -} - -static uint32_t -unpack31_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t *in32 = (uint32_t *)in; - uint32_t tmp; (void)tmp; - *(out + 0) = base + ((*in32 >> 0) & 2147483647); - tmp = (*in32 >> 31); - in32++; - /* consumed: 4 bytes (total: 8) */ - tmp |= (*in32 % (1U << 30)) << (31 - 30); - *(out + 1) = base + tmp; - tmp = (*in32 >> 30); - in32++; - /* consumed: 4 bytes (total: 12) */ - tmp |= (*in32 % (1U << 29)) << (31 - 29); - *(out + 2) = base + tmp; - tmp = (*in32 >> 29); - in32++; - /* consumed: 4 bytes (total: 16) */ - tmp |= (*in32 % (1U << 28)) << (31 - 28); - *(out + 3) = base + tmp; - tmp = (*in32 >> 28); - in32++; - /* consumed: 4 bytes (total: 20) */ - tmp |= (*in32 % (1U << 27)) << (31 - 27); - *(out + 4) = base + tmp; - tmp = (*in32 >> 27); - in32++; - /* consumed: 4 bytes (total: 24) */ - tmp |= (*in32 % (1U << 26)) << (31 - 26); - *(out + 5) = base + tmp; - tmp = (*in32 >> 26); - in32++; - /* consumed: 4 bytes (total: 28) */ - tmp |= (*in32 % (1U << 25)) << (31 - 25); - *(out + 6) = base + tmp; - tmp = (*in32 >> 25); - in32++; - /* consumed: 4 bytes (total: 32) */ - tmp |= (*in32 % (1U << 24)) << (31 - 24); - *(out + 7) = base + tmp; - /* 
remaining: 8 bits */ - return 31; -} - -static uint32_t -pack32_8(uint32_t base, const uint32_t *in, uint8_t *out) { - uint32_t i; - uint32_t *out32 = (uint32_t *)out; - for (i = 0; i < 8; i++) - out32[i] = in[i] - base; - return 8 * sizeof(uint32_t); -} - -static uint32_t -unpack32_8(uint32_t base, const uint8_t *in, uint32_t *out) { - uint32_t i; - uint32_t *in32 = (uint32_t *)in; - for (i = 0; i < 8; i++) - out[i] = base + in32[i]; - return 8 * sizeof(uint32_t); -} - -for_packfunc_t for_pack8[33] = { - pack0_n, - pack1_8, - pack2_8, - pack3_8, - pack4_8, - pack5_8, - pack6_8, - pack7_8, - pack8_8, - pack9_8, - pack10_8, - pack11_8, - pack12_8, - pack13_8, - pack14_8, - pack15_8, - pack16_8, - pack17_8, - pack18_8, - pack19_8, - pack20_8, - pack21_8, - pack22_8, - pack23_8, - pack24_8, - pack25_8, - pack26_8, - pack27_8, - pack28_8, - pack29_8, - pack30_8, - pack31_8, - pack32_8 -}; - -for_unpackfunc_t for_unpack8[33] = { - unpack0_n, - unpack1_8, - unpack2_8, - unpack3_8, - unpack4_8, - unpack5_8, - unpack6_8, - unpack7_8, - unpack8_8, - unpack9_8, - unpack10_8, - unpack11_8, - unpack12_8, - unpack13_8, - unpack14_8, - unpack15_8, - unpack16_8, - unpack17_8, - unpack18_8, - unpack19_8, - unpack20_8, - unpack21_8, - unpack22_8, - unpack23_8, - unpack24_8, - unpack25_8, - unpack26_8, - unpack27_8, - unpack28_8, - unpack29_8, - unpack30_8, - unpack31_8, - unpack32_8 -}; - -static uint32_t -pack1_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 1; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 2; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 3; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 4; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 5; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 6; - if (length == 7) - 
goto bail; - tmp |= (*(in + 7) - base) << 7; - if (length == 8) - goto bail; -bail: - remaining = (((length * 1) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 1) + 7) / 8; -} - -static uint32_t -unpack1_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 1) & 1); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 2) & 1); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 3) & 1); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 4) & 1); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 5) & 1); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 6) & 1); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 7) & 1); - if (length == 8) - goto bail; -bail: - return ((length * 1) + 7) / 8; -} - -static uint32_t -pack2_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 2; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 4; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 6; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 8; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 10; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 12; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 14; - if (length == 8) - goto bail; -bail: - remaining = (((length * 2) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 2) + 7) / 8; -} - -static uint32_t -unpack2_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - 
return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 3); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 2) & 3); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 4) & 3); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 6) & 3); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 8) & 3); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 10) & 3); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 12) & 3); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 14) & 3); - if (length == 8) - goto bail; -bail: - return ((length * 2) + 7) / 8; -} - -static uint32_t -pack3_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 3; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 6; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 9; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 12; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 15; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 18; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 21; - if (length == 8) - goto bail; -bail: - remaining = (((length * 3) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 3) + 7) / 8; -} - -static uint32_t -unpack3_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 7); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 3) & 7); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 6) & 7); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 9) & 7); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 12) & 7); - 
if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 15) & 7); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 18) & 7); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 21) & 7); - if (length == 8) - goto bail; -bail: - return ((length * 3) + 7) / 8; -} - -static uint32_t -pack4_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 4; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 8; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 12; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 16; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 20; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 24; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 28; - if (length == 8) - goto bail; -bail: - remaining = (((length * 4) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 4) + 7) / 8; -} - -static uint32_t -unpack4_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 15); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 4) & 15); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 8) & 15); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 12) & 15); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 16) & 15); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 20) & 15); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 24) & 15); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 28) & 15); - if (length == 8) - goto bail; -bail: - return ((length * 4) + 7) / 8; -} - -static uint32_t -pack5_x(uint32_t base, const uint32_t *in, 
uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 5; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 10; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 15; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 20; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 25; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (5 - 3); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 3; - if (length == 8) - goto bail; -bail: - remaining = (((length * 5) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 5) + 7) / 8; -} - -static uint32_t -unpack5_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 31); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 5) & 31); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 10) & 31); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 15) & 31); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 20) & 31); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 25) & 31); - if (length == 6) - goto bail; - *(out + 6) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 3)) << (5 - 3); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 3) & 31); - if (length == 8) - goto bail; -bail: - return ((length * 5) + 7) / 8; -} - -static uint32_t -pack6_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - 
tmp |= (*(in + 1) - base) << 6; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 12; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 18; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 24; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (6 - 4); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 4; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 10; - if (length == 8) - goto bail; -bail: - remaining = (((length * 6) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 6) + 7) / 8; -} - -static uint32_t -unpack6_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 63); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 6) & 63); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 12) & 63); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 18) & 63); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 24) & 63); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 4)) << (6 - 4); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 4) & 63); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 10) & 63); - if (length == 8) - goto bail; -bail: - return ((length * 6) + 7) / 8; -} - -static uint32_t -pack7_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 7; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 14; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) 
<< 21; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (7 - 3); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 3; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 10; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 17; - if (length == 8) - goto bail; -bail: - remaining = (((length * 7) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 7) + 7) / 8; -} - -static uint32_t -unpack7_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 127); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 7) & 127); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 14) & 127); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 21) & 127); - if (length == 4) - goto bail; - *(out + 4) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 3)) << (7 - 3); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 3) & 127); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 10) & 127); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 17) & 127); - if (length == 8) - goto bail; -bail: - return ((length * 7) + 7) / 8; -} - -static uint32_t -pack8_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 8; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 16; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 24; - if (length == 4) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; - if (length == 5) - goto 
bail; - tmp |= (*(in + 5) - base) << 8; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 16; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 24; - if (length == 8) - goto bail; -bail: - remaining = (((length * 8) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 8) + 7) / 8; -} - -static uint32_t -unpack8_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 255); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 8) & 255); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 16) & 255); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 24) & 255); - if (length == 4) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 255); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 8) & 255); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 16) & 255); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 24) & 255); - if (length == 8) - goto bail; -bail: - return ((length * 8) + 7) / 8; -} - -static uint32_t -pack9_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 9; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 18; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (9 - 4); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 4; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 13; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 22; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 31; - *(uint32_t *)out = tmp; - out 
+= sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (9 - 8); - if (length == 8) - goto bail; -bail: - remaining = (((length * 9) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 9) + 7) / 8; -} - -static uint32_t -unpack9_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 511); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 9) & 511); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 18) & 511); - if (length == 3) - goto bail; - *(out + 3) = tmp >> 27; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 4)) << (9 - 4); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 4) & 511); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 13) & 511); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 22) & 511); - if (length == 7) - goto bail; - *(out + 7) = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 8)) << (9 - 8); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 9) + 7) / 8; -} - -static uint32_t -pack10_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 10; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 20; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (10 - 8); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 8; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 18; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - 
tmp = (*(in + 6) - base) >> (10 - 6); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 6; - if (length == 8) - goto bail; -bail: - remaining = (((length * 10) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 10) + 7) / 8; -} - -static uint32_t -unpack10_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1023); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 10) & 1023); - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 20) & 1023); - if (length == 3) - goto bail; - *(out + 3) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 8)) << (10 - 8); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 8) & 1023); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 18) & 1023); - if (length == 6) - goto bail; - *(out + 6) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 6)) << (10 - 6); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 6) & 1023); - if (length == 8) - goto bail; -bail: - return ((length * 10) + 7) / 8; -} - -static uint32_t -pack11_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 11; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (11 - 1); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 1; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 12; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in 
+ 5) - base) >> (11 - 2); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 2; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 13; - if (length == 8) - goto bail; -bail: - remaining = (((length * 11) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 11) + 7) / 8; -} - -static uint32_t -unpack11_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 2047); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 11) & 2047); - if (length == 2) - goto bail; - *(out + 2) = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 1)) << (11 - 1); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 1) & 2047); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 12) & 2047); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 2)) << (11 - 2); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 2) & 2047); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 13) & 2047); - if (length == 8) - goto bail; -bail: - return ((length * 11) + 7) / 8; -} - -static uint32_t -pack12_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 12; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (12 - 4); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 4; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 16; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 28; - 
*(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (12 - 8); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 8; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 20; - if (length == 8) - goto bail; -bail: - remaining = (((length * 12) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 12) + 7) / 8; -} - -static uint32_t -unpack12_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 4095); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 12) & 4095); - if (length == 2) - goto bail; - *(out + 2) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 4)) << (12 - 4); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 4) & 4095); - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 16) & 4095); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 8)) << (12 - 8); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 8) & 4095); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 20) & 4095); - if (length == 8) - goto bail; -bail: - return ((length * 12) + 7) / 8; -} - -static uint32_t -pack13_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 13; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (13 - 7); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 7; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 20; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (13 - 1); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 1; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 14; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (13 - 8); - if (length == 8) - goto bail; -bail: - remaining = (((length * 13) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 13) + 7) / 8; -} - -static uint32_t -unpack13_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 8191); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 13) & 8191); - if (length == 2) - goto bail; - *(out + 2) = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 7)) << (13 - 7); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 7) & 8191); - if (length == 4) - goto bail; - *(out + 4) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 1)) << (13 - 1); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 1) & 8191); - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 14) & 8191); - if (length == 7) - goto bail; - *(out + 7) = tmp >> 27; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 8)) << (13 - 8); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 13) + 7) / 8; -} - -static uint32_t -pack14_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 14; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - 
base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (14 - 10); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 10; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (14 - 6); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 6; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (14 - 2); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 2; - if (length == 8) - goto bail; -bail: - remaining = (((length * 14) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 14) + 7) / 8; -} - -static uint32_t -unpack14_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 16383); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 14) & 16383); - if (length == 2) - goto bail; - *(out + 2) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 10)) << (14 - 10); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 10) & 16383); - if (length == 4) - goto bail; - *(out + 4) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 6)) << (14 - 6); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 6) & 16383); - if (length == 6) - goto bail; - *(out + 6) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 2)) << (14 - 2); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 2) & 16383); - if (length == 8) - goto bail; -bail: - return ((length * 14) + 7) / 8; -} - -static uint32_t -pack15_x(uint32_t base, const 
uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 15; - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (15 - 13); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 13; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (15 - 11); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 11; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (15 - 9); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 9; - if (length == 8) - goto bail; -bail: - remaining = (((length * 15) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 15) + 7) / 8; -} - -static uint32_t -unpack15_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 32767); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 15) & 32767); - if (length == 2) - goto bail; - *(out + 2) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 13)) << (15 - 13); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 13) & 32767); - if (length == 4) - goto bail; - *(out + 4) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 11)) << (15 - 11); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 11) & 32767); - if (length == 6) - goto bail; - *(out + 6) = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out 
+ 6) |= (tmp % (1U << 9)) << (15 - 9); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 9) & 32767); - if (length == 8) - goto bail; -bail: - return ((length * 15) + 7) / 8; -} - -static uint32_t -pack16_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 16; - if (length == 2) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) << 0; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 16; - if (length == 4) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 16; - if (length == 6) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) << 0; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 16; - if (length == 8) - goto bail; -bail: - remaining = (((length * 16) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 16) + 7) / 8; -} - -static uint32_t -unpack16_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 65535); - if (length == 1) - goto bail; - *(out + 1) = base + ((tmp >> 16) & 65535); - if (length == 2) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) = base + ((tmp >> 0) & 65535); - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 16) & 65535); - if (length == 4) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 65535); - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 16) & 65535); - if (length == 6) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - 
*(out + 6) = base + ((tmp >> 0) & 65535); - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 16) & 65535); - if (length == 8) - goto bail; -bail: - return ((length * 16) + 7) / 8; -} - -static uint32_t -pack17_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (17 - 2); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 2; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (17 - 4); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 4; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (17 - 6); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 6; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (17 - 8); - if (length == 8) - goto bail; -bail: - remaining = (((length * 17) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 17) + 7) / 8; -} - -static uint32_t -unpack17_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 131071); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 17; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 2)) << (17 - 2); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 2) & 131071); - if (length == 3) - goto bail; - *(out + 3) = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out 
+ 3) |= (tmp % (1U << 4)) << (17 - 4); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 4) & 131071); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 21; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 6)) << (17 - 6); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 6) & 131071); - if (length == 7) - goto bail; - *(out + 7) = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 8)) << (17 - 8); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 17) + 7) / 8; -} - -static uint32_t -pack18_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (18 - 4); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 4; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (18 - 8); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 8; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (18 - 12); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 12; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (18 - 16); - if (length == 8) - goto bail; -bail: - remaining = (((length * 18) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 18) + 7) / 8; -} - -static uint32_t -unpack18_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) 
- return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 262143); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 4)) << (18 - 4); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 4) & 262143); - if (length == 3) - goto bail; - *(out + 3) = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 8)) << (18 - 8); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 8) & 262143); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 12)) << (18 - 12); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 12) & 262143); - if (length == 7) - goto bail; - *(out + 7) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 16)) << (18 - 16); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 18) + 7) / 8; -} - -static uint32_t -pack19_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (19 - 6); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 6; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (19 - 12); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 12; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (19 - 18); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 18; - *(uint32_t *)out = 
tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (19 - 5); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 5; - if (length == 8) - goto bail; -bail: - remaining = (((length * 19) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 19) + 7) / 8; -} - -static uint32_t -unpack19_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 524287); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 6)) << (19 - 6); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 6) & 524287); - if (length == 3) - goto bail; - *(out + 3) = tmp >> 25; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 12)) << (19 - 12); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 12) & 524287); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 18)) << (19 - 18); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 5)) << (19 - 5); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 5) & 524287); - if (length == 8) - goto bail; -bail: - return ((length * 19) + 7) / 8; -} - -static uint32_t -pack20_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (20 - 8); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 8; - if (length == 
3) - goto bail; - tmp |= (*(in + 3) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (20 - 16); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (20 - 4); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 4; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (20 - 12); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 12; - if (length == 8) - goto bail; -bail: - remaining = (((length * 20) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 20) + 7) / 8; -} - -static uint32_t -unpack20_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1048575); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 8)) << (20 - 8); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 8) & 1048575); - if (length == 3) - goto bail; - *(out + 3) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 16)) << (20 - 16); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 4)) << (20 - 4); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 4) & 1048575); - if (length == 6) - goto bail; - *(out + 6) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 12)) << (20 - 12); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 12) & 1048575); - if (length == 8) - goto bail; -bail: - 
return ((length * 20) + 7) / 8; -} - -static uint32_t -pack21_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 21; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (21 - 10); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 10; - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (21 - 20); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (21 - 9); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 9; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (21 - 19); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (21 - 8); - if (length == 8) - goto bail; -bail: - remaining = (((length * 21) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 21) + 7) / 8; -} - -static uint32_t -unpack21_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 2097151); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 21; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 10)) << (21 - 10); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = base + ((tmp >> 10) & 2097151); - if (length == 3) - goto bail; - *(out + 3) = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 20)) << (21 - 20); - *(out + 3) += 
base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 9)) << (21 - 9); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 9) & 2097151); - if (length == 6) - goto bail; - *(out + 6) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 19)) << (21 - 19); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 8)) << (21 - 8); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 21) + 7) / 8; -} - -static uint32_t -pack22_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (22 - 12); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (22 - 2); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 2; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (22 - 14); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (22 - 4); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 4; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (22 - 16); - if (length == 8) - goto bail; -bail: - remaining = (((length * 22) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 22) + 
7) / 8; -} - -static uint32_t -unpack22_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 4194303); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 12)) << (22 - 12); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 12; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 2)) << (22 - 2); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 2) & 4194303); - if (length == 4) - goto bail; - *(out + 4) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 14)) << (22 - 14); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 4)) << (22 - 4); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 4) & 4194303); - if (length == 7) - goto bail; - *(out + 7) = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 16)) << (22 - 16); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 22) + 7) / 8; -} - -static uint32_t -pack23_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (23 - 14); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (23 - 5); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 5; - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 28; - 
*(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (23 - 19); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 19; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (23 - 10); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 10; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (23 - 1); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 1; - if (length == 8) - goto bail; -bail: - remaining = (((length * 23) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 23) + 7) / 8; -} - -static uint32_t -unpack23_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 8388607); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 14)) << (23 - 14); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 5)) << (23 - 5); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 5) & 8388607); - if (length == 4) - goto bail; - *(out + 4) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 19)) << (23 - 19); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 10)) << (23 - 10); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 10; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 1)) << (23 - 1); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 1) & 8388607); - if (length == 8) - goto bail; -bail: - return ((length * 
23) + 7) / 8; -} - -static uint32_t -pack24_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (24 - 16); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (24 - 8); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 8; - if (length == 4) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (24 - 16); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (24 - 8); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 8; - if (length == 8) - goto bail; -bail: - remaining = (((length * 24) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 24) + 7) / 8; -} - -static uint32_t -unpack24_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 16777215); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 16)) << (24 - 16); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 8)) << (24 - 8); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = base + ((tmp >> 8) & 16777215); - if (length == 4) - goto bail; - in += sizeof(uint32_t); 
- tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 16777215); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 16)) << (24 - 16); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 8)) << (24 - 8); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 8) & 16777215); - if (length == 8) - goto bail; -bail: - return ((length * 24) + 7) / 8; -} - -static uint32_t -pack25_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (25 - 18); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (25 - 11); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (25 - 4); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 4; - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (25 - 22); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (25 - 15); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 15; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (25 - 8); - if (length == 8) - goto bail; -bail: - remaining = (((length * 25) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 25) + 7) / 8; -} - 
-static uint32_t -unpack25_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 33554431); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 25; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 18)) << (25 - 18); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 11)) << (25 - 11); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = tmp >> 11; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 4)) << (25 - 4); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = base + ((tmp >> 4) & 33554431); - if (length == 5) - goto bail; - *(out + 5) = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 22)) << (25 - 22); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 15)) << (25 - 15); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = tmp >> 15; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 8)) << (25 - 8); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 25) + 7) / 8; -} - -static uint32_t -pack26_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (26 - 20); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (26 - 14); - if (length == 3) - goto bail; - tmp |= 
(*(in + 3) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (26 - 8); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (26 - 2); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 2; - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (26 - 22); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (26 - 16); - if (length == 8) - goto bail; -bail: - remaining = (((length * 26) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 26) + 7) / 8; -} - -static uint32_t -unpack26_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 67108863); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 20)) << (26 - 20); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 14)) << (26 - 14); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 8)) << (26 - 8); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 8; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 2)) << (26 - 2); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = base + ((tmp >> 2) & 67108863); - if (length == 6) - goto bail; - *(out + 6) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % 
(1U << 22)) << (26 - 22); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 16)) << (26 - 16); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 26) + 7) / 8; -} - -static uint32_t -pack27_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (27 - 22); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 22; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (27 - 17); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (27 - 12); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (27 - 7); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 7; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (27 - 2); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 2; - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (27 - 24); - if (length == 8) - goto bail; -bail: - remaining = (((length * 27) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 27) + 7) / 8; -} - -static uint32_t -unpack27_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 134217727); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 27; - in += 
sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 22)) << (27 - 22); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 17)) << (27 - 17); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = tmp >> 17; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 12)) << (27 - 12); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 12; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 7)) << (27 - 7); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = tmp >> 7; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 2)) << (27 - 2); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = base + ((tmp >> 2) & 134217727); - if (length == 7) - goto bail; - *(out + 7) = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 24)) << (27 - 24); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 27) + 7) / 8; -} - -static uint32_t -pack28_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (28 - 24); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (28 - 20); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (28 - 16); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 16; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (28 - 12); - 
if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 12; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (28 - 8); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 8; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (28 - 4); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 4; - if (length == 8) - goto bail; -bail: - remaining = (((length * 28) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 28) + 7) / 8; -} - -static uint32_t -unpack28_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 268435455); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 24)) << (28 - 24); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 20)) << (28 - 20); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 16)) << (28 - 16); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 12)) << (28 - 12); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = tmp >> 12; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 8)) << (28 - 8); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 8; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 4)) << (28 - 4); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = base + ((tmp >> 4) & 268435455); - if (length == 8) - goto bail; -bail: - return 
((length * 28) + 7) / 8; -} - -static uint32_t -pack29_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (29 - 26); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (29 - 23); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 23; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (29 - 20); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (29 - 17); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 17; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (29 - 14); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 14; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (29 - 11); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 11; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (29 - 8); - if (length == 8) - goto bail; -bail: - remaining = (((length * 29) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 29) + 7) / 8; -} - -static uint32_t -unpack29_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 536870911); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 26)) << (29 - 26); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 26; - in += 
sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 23)) << (29 - 23); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 20)) << (29 - 20); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 17)) << (29 - 17); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = tmp >> 17; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 14)) << (29 - 14); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 11)) << (29 - 11); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = tmp >> 11; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 8)) << (29 - 8); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 29) + 7) / 8; -} - -static uint32_t -pack30_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (30 - 28); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (30 - 26); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (30 - 24); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 24; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (30 - 22); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 22; - *(uint32_t 
*)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (30 - 20); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 20; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (30 - 18); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 18; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (30 - 16); - if (length == 8) - goto bail; -bail: - remaining = (((length * 30) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 30) + 7) / 8; -} - -static uint32_t -unpack30_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 1073741823); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) |= (tmp % (1U << 28)) << (30 - 28); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 26)) << (30 - 26); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 24)) << (30 - 24); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 22)) << (30 - 22); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 20)) << (30 - 20); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 18)) << (30 - 18); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t 
*)in; - *(out + 7) |= (tmp % (1U << 16)) << (30 - 16); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 30) + 7) / 8; -} - -static uint32_t -pack31_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - tmp |= (*(in + 1) - base) << 31; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) >> (31 - 30); - if (length == 2) - goto bail; - tmp |= (*(in + 2) - base) << 30; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) >> (31 - 29); - if (length == 3) - goto bail; - tmp |= (*(in + 3) - base) << 29; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) >> (31 - 28); - if (length == 4) - goto bail; - tmp |= (*(in + 4) - base) << 28; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) >> (31 - 27); - if (length == 5) - goto bail; - tmp |= (*(in + 5) - base) << 27; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) >> (31 - 26); - if (length == 6) - goto bail; - tmp |= (*(in + 6) - base) << 26; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) >> (31 - 25); - if (length == 7) - goto bail; - tmp |= (*(in + 7) - base) << 25; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) >> (31 - 24); - if (length == 8) - goto bail; -bail: - remaining = (((length * 31) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 31) + 7) / 8; -} - -static uint32_t -unpack31_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 2147483647); - if (length == 1) - goto bail; - *(out + 1) = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) 
|= (tmp % (1U << 30)) << (31 - 30); - *(out + 1) += base; - if (length == 2) - goto bail; - *(out + 2) = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) |= (tmp % (1U << 29)) << (31 - 29); - *(out + 2) += base; - if (length == 3) - goto bail; - *(out + 3) = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) |= (tmp % (1U << 28)) << (31 - 28); - *(out + 3) += base; - if (length == 4) - goto bail; - *(out + 4) = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) |= (tmp % (1U << 27)) << (31 - 27); - *(out + 4) += base; - if (length == 5) - goto bail; - *(out + 5) = tmp >> 27; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) |= (tmp % (1U << 26)) << (31 - 26); - *(out + 5) += base; - if (length == 6) - goto bail; - *(out + 6) = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) |= (tmp % (1U << 25)) << (31 - 25); - *(out + 6) += base; - if (length == 7) - goto bail; - *(out + 7) = tmp >> 25; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) |= (tmp % (1U << 24)) << (31 - 24); - *(out + 7) += base; - if (length == 8) - goto bail; -bail: - return ((length * 31) + 7) / 8; -} - -static uint32_t -pack32_x(uint32_t base, const uint32_t *in, uint8_t *out, uint32_t length) { - uint32_t tmp, remaining; - if (length == 0) - return 0; - tmp = (*(in + 0) - base) << 0; - if (length == 1) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 1) - base) << 0; - if (length == 2) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 2) - base) << 0; - if (length == 3) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 3) - base) << 0; - if (length == 4) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 4) - base) << 0; - if (length == 5) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 5) - base) << 0; - if 
(length == 6) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 6) - base) << 0; - if (length == 7) - goto bail; - *(uint32_t *)out = tmp; - out += sizeof(uint32_t); - tmp = (*(in + 7) - base) << 0; - if (length == 8) - goto bail; -bail: - remaining = (((length * 32) + 7) / 8) % 4; - if (remaining == 0) remaining = 4; - memcpy(out, &tmp, remaining); - return ((length * 32) + 7) / 8; -} - -static uint32_t -unpack32_x(uint32_t base, const uint8_t *in, uint32_t *out, uint32_t length) { - uint32_t tmp; - if (length == 0) - return 0; - tmp = *(uint32_t *)in; - *(out + 0) = base + ((tmp >> 0) & 4294967295); - if (length == 1) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 1) = base + ((tmp >> 0) & 4294967295); - if (length == 2) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 2) = base + ((tmp >> 0) & 4294967295); - if (length == 3) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 3) = base + ((tmp >> 0) & 4294967295); - if (length == 4) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 4) = base + ((tmp >> 0) & 4294967295); - if (length == 5) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 5) = base + ((tmp >> 0) & 4294967295); - if (length == 6) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 6) = base + ((tmp >> 0) & 4294967295); - if (length == 7) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - *(out + 7) = base + ((tmp >> 0) & 4294967295); - if (length == 8) - goto bail; -bail: - return ((length * 32) + 7) / 8; -} - -for_packxfunc_t for_packx[33] = { - pack0_x, - pack1_x, - pack2_x, - pack3_x, - pack4_x, - pack5_x, - pack6_x, - pack7_x, - pack8_x, - pack9_x, - pack10_x, - pack11_x, - pack12_x, - pack13_x, - pack14_x, - pack15_x, - pack16_x, - pack17_x, - pack18_x, - pack19_x, - pack20_x, - pack21_x, - pack22_x, - pack23_x, - pack24_x, - pack25_x, - pack26_x, - 
pack27_x, - pack28_x, - pack29_x, - pack30_x, - pack31_x, - pack32_x -}; - -for_unpackxfunc_t for_unpackx[33] = { - unpack0_x, - unpack1_x, - unpack2_x, - unpack3_x, - unpack4_x, - unpack5_x, - unpack6_x, - unpack7_x, - unpack8_x, - unpack9_x, - unpack10_x, - unpack11_x, - unpack12_x, - unpack13_x, - unpack14_x, - unpack15_x, - unpack16_x, - unpack17_x, - unpack18_x, - unpack19_x, - unpack20_x, - unpack21_x, - unpack22_x, - unpack23_x, - unpack24_x, - unpack25_x, - unpack26_x, - unpack27_x, - unpack28_x, - unpack29_x, - unpack30_x, - unpack31_x, - unpack32_x -}; - -static uint32_t -linsearch1_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1) == value) { - *found = 0; - return 0; - } - if (((tmp >> 1) & 1) == value) { - *found = 1; - return 1; - } - if (((tmp >> 2) & 1) == value) { - *found = 2; - return 2; - } - if (((tmp >> 3) & 1) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 1) == value) { - *found = 4; - return 4; - } - if (((tmp >> 5) & 1) == value) { - *found = 5; - return 5; - } - if (((tmp >> 6) & 1) == value) { - *found = 6; - return 6; - } - if (((tmp >> 7) & 1) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 1) == value) { - *found = 8; - return 8; - } - if (((tmp >> 9) & 1) == value) { - *found = 9; - return 9; - } - if (((tmp >> 10) & 1) == value) { - *found = 10; - return 10; - } - if (((tmp >> 11) & 1) == value) { - *found = 11; - return 11; - } - if (((tmp >> 12) & 1) == value) { - *found = 12; - return 12; - } - if (((tmp >> 13) & 1) == value) { - *found = 13; - return 13; - } - if (((tmp >> 14) & 1) == value) { - *found = 14; - return 14; - } - if (((tmp >> 15) & 1) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 1) == value) { - *found = 16; - return 16; - } - if (((tmp >> 17) & 1) == value) { - *found = 17; - return 17; - } - if (((tmp >> 18) & 1) 
== value) { - *found = 18; - return 18; - } - if (((tmp >> 19) & 1) == value) { - *found = 19; - return 19; - } - if (((tmp >> 20) & 1) == value) { - *found = 20; - return 20; - } - if (((tmp >> 21) & 1) == value) { - *found = 21; - return 21; - } - if (((tmp >> 22) & 1) == value) { - *found = 22; - return 22; - } - if (((tmp >> 23) & 1) == value) { - *found = 23; - return 23; - } - if (((tmp >> 24) & 1) == value) { - *found = 24; - return 24; - } - if (((tmp >> 25) & 1) == value) { - *found = 25; - return 25; - } - if (((tmp >> 26) & 1) == value) { - *found = 26; - return 26; - } - if (((tmp >> 27) & 1) == value) { - *found = 27; - return 27; - } - if (((tmp >> 28) & 1) == value) { - *found = 28; - return 28; - } - if (((tmp >> 29) & 1) == value) { - *found = 29; - return 29; - } - if (((tmp >> 30) & 1) == value) { - *found = 30; - return 30; - } - if (((tmp >> 31) & 1) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (4); -} - -static uint32_t -linsearch2_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 3) == value) { - *found = 0; - return 0; - } - if (((tmp >> 2) & 3) == value) { - *found = 1; - return 1; - } - if (((tmp >> 4) & 3) == value) { - *found = 2; - return 2; - } - if (((tmp >> 6) & 3) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 3) == value) { - *found = 4; - return 4; - } - if (((tmp >> 10) & 3) == value) { - *found = 5; - return 5; - } - if (((tmp >> 12) & 3) == value) { - *found = 6; - return 6; - } - if (((tmp >> 14) & 3) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 3) == value) { - *found = 8; - return 8; - } - if (((tmp >> 18) & 3) == value) { - *found = 9; - return 9; - } - if (((tmp >> 20) & 3) == value) { - *found = 10; - return 10; - } - if (((tmp >> 22) & 3) == value) { - *found = 11; - return 11; - } - if (((tmp >> 24) & 3) 
== value) { - *found = 12; - return 12; - } - if (((tmp >> 26) & 3) == value) { - *found = 13; - return 13; - } - if (((tmp >> 28) & 3) == value) { - *found = 14; - return 14; - } - if (((tmp >> 30) & 3) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 3) == value) { - *found = 16; - return 16; - } - if (((tmp >> 2) & 3) == value) { - *found = 17; - return 17; - } - if (((tmp >> 4) & 3) == value) { - *found = 18; - return 18; - } - if (((tmp >> 6) & 3) == value) { - *found = 19; - return 19; - } - if (((tmp >> 8) & 3) == value) { - *found = 20; - return 20; - } - if (((tmp >> 10) & 3) == value) { - *found = 21; - return 21; - } - if (((tmp >> 12) & 3) == value) { - *found = 22; - return 22; - } - if (((tmp >> 14) & 3) == value) { - *found = 23; - return 23; - } - if (((tmp >> 16) & 3) == value) { - *found = 24; - return 24; - } - if (((tmp >> 18) & 3) == value) { - *found = 25; - return 25; - } - if (((tmp >> 20) & 3) == value) { - *found = 26; - return 26; - } - if (((tmp >> 22) & 3) == value) { - *found = 27; - return 27; - } - if (((tmp >> 24) & 3) == value) { - *found = 28; - return 28; - } - if (((tmp >> 26) & 3) == value) { - *found = 29; - return 29; - } - if (((tmp >> 28) & 3) == value) { - *found = 30; - return 30; - } - if (((tmp >> 30) & 3) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (8); -} - -static uint32_t -linsearch3_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 7) == value) { - *found = 0; - return 0; - } - if (((tmp >> 3) & 7) == value) { - *found = 1; - return 1; - } - if (((tmp >> 6) & 7) == value) { - *found = 2; - return 2; - } - if (((tmp >> 9) & 7) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 7) == value) { - *found = 4; - return 4; - 
} - if (((tmp >> 15) & 7) == value) { - *found = 5; - return 5; - } - if (((tmp >> 18) & 7) == value) { - *found = 6; - return 6; - } - if (((tmp >> 21) & 7) == value) { - *found = 7; - return 7; - } - if (((tmp >> 24) & 7) == value) { - *found = 8; - return 8; - } - if (((tmp >> 27) & 7) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (3 - 1)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 1) & 7) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 7) == value) { - *found = 12; - return 12; - } - if (((tmp >> 7) & 7) == value) { - *found = 13; - return 13; - } - if (((tmp >> 10) & 7) == value) { - *found = 14; - return 14; - } - if (((tmp >> 13) & 7) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 7) == value) { - *found = 16; - return 16; - } - if (((tmp >> 19) & 7) == value) { - *found = 17; - return 17; - } - if (((tmp >> 22) & 7) == value) { - *found = 18; - return 18; - } - if (((tmp >> 25) & 7) == value) { - *found = 19; - return 19; - } - if (((tmp >> 28) & 7) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (3 - 2)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 2) & 7) == value) { - *found = 22; - return 22; - } - if (((tmp >> 5) & 7) == value) { - *found = 23; - return 23; - } - if (((tmp >> 8) & 7) == value) { - *found = 24; - return 24; - } - if (((tmp >> 11) & 7) == value) { - *found = 25; - return 25; - } - if (((tmp >> 14) & 7) == value) { - *found = 26; - return 26; - } - if (((tmp >> 17) & 7) == value) { - *found = 27; - return 27; - } - if (((tmp >> 20) & 7) == value) { - *found = 28; - return 28; - } - if (((tmp >> 23) & 7) == value) { - *found = 29; - return 29; - } - if (((tmp >> 26) & 7) == value) { - *found = 30; - 
return 30; - } - if (((tmp >> 29) & 7) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (12); -} - -static uint32_t -linsearch4_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 15) == value) { - *found = 0; - return 0; - } - if (((tmp >> 4) & 15) == value) { - *found = 1; - return 1; - } - if (((tmp >> 8) & 15) == value) { - *found = 2; - return 2; - } - if (((tmp >> 12) & 15) == value) { - *found = 3; - return 3; - } - if (((tmp >> 16) & 15) == value) { - *found = 4; - return 4; - } - if (((tmp >> 20) & 15) == value) { - *found = 5; - return 5; - } - if (((tmp >> 24) & 15) == value) { - *found = 6; - return 6; - } - if (((tmp >> 28) & 15) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 15) == value) { - *found = 8; - return 8; - } - if (((tmp >> 4) & 15) == value) { - *found = 9; - return 9; - } - if (((tmp >> 8) & 15) == value) { - *found = 10; - return 10; - } - if (((tmp >> 12) & 15) == value) { - *found = 11; - return 11; - } - if (((tmp >> 16) & 15) == value) { - *found = 12; - return 12; - } - if (((tmp >> 20) & 15) == value) { - *found = 13; - return 13; - } - if (((tmp >> 24) & 15) == value) { - *found = 14; - return 14; - } - if (((tmp >> 28) & 15) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 12) */ - if (((tmp >> 0) & 15) == value) { - *found = 16; - return 16; - } - if (((tmp >> 4) & 15) == value) { - *found = 17; - return 17; - } - if (((tmp >> 8) & 15) == value) { - *found = 18; - return 18; - } - if (((tmp >> 12) & 15) == value) { - *found = 19; - return 19; - } - if (((tmp >> 16) & 15) == value) { - *found = 20; - return 20; - } - if (((tmp >> 20) & 15) == value) { - *found = 21; - return 
21; - } - if (((tmp >> 24) & 15) == value) { - *found = 22; - return 22; - } - if (((tmp >> 28) & 15) == value) { - *found = 23; - return 23; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 15) == value) { - *found = 24; - return 24; - } - if (((tmp >> 4) & 15) == value) { - *found = 25; - return 25; - } - if (((tmp >> 8) & 15) == value) { - *found = 26; - return 26; - } - if (((tmp >> 12) & 15) == value) { - *found = 27; - return 27; - } - if (((tmp >> 16) & 15) == value) { - *found = 28; - return 28; - } - if (((tmp >> 20) & 15) == value) { - *found = 29; - return 29; - } - if (((tmp >> 24) & 15) == value) { - *found = 30; - return 30; - } - if (((tmp >> 28) & 15) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (16); -} - -static uint32_t -linsearch5_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 31) == value) { - *found = 0; - return 0; - } - if (((tmp >> 5) & 31) == value) { - *found = 1; - return 1; - } - if (((tmp >> 10) & 31) == value) { - *found = 2; - return 2; - } - if (((tmp >> 15) & 31) == value) { - *found = 3; - return 3; - } - if (((tmp >> 20) & 31) == value) { - *found = 4; - return 4; - } - if (((tmp >> 25) & 31) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (5 - 3)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 3) & 31) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 31) == value) { - *found = 8; - return 8; - } - if (((tmp >> 13) & 31) == value) { - *found = 9; - return 9; - } - if (((tmp >> 18) & 31) == value) { - *found = 10; - return 10; - } - if (((tmp >> 23) & 31) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (5 - 1)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 1) & 31) == value) { - *found = 13; - return 13; - } - if (((tmp >> 6) & 31) == value) { - *found = 14; - return 14; - } - if (((tmp >> 11) & 31) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 31) == value) { - *found = 16; - return 16; - } - if (((tmp >> 21) & 31) == value) { - *found = 17; - return 17; - } - if (((tmp >> 26) & 31) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (5 - 4)) == value) { - *found = 19; - return 19; - } - if (((tmp >> 4) & 31) == value) { - *found = 20; - return 20; - } - if (((tmp >> 9) & 31) == value) { - *found = 21; - return 21; - } - if (((tmp >> 14) & 31) == value) { - *found = 22; - return 22; - } - if (((tmp >> 19) & 31) == value) { - *found = 23; - return 23; - } - if (((tmp >> 24) & 31) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (5 - 2)) == value) { - *found = 25; - return 25; - } - if (((tmp >> 2) & 31) == value) { - *found = 26; - return 26; - } - if (((tmp >> 7) & 31) == value) { - *found = 27; - return 27; - } - if (((tmp >> 12) & 31) == value) { - *found = 28; - return 28; - } - if (((tmp >> 17) & 31) == value) { - *found = 29; - return 29; - } - if (((tmp >> 22) & 31) == value) { - *found = 30; - return 30; - } - if (((tmp >> 27) & 31) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (20); -} - -static uint32_t -linsearch6_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp 
>> 0) & 63) == value) { - *found = 0; - return 0; - } - if (((tmp >> 6) & 63) == value) { - *found = 1; - return 1; - } - if (((tmp >> 12) & 63) == value) { - *found = 2; - return 2; - } - if (((tmp >> 18) & 63) == value) { - *found = 3; - return 3; - } - if (((tmp >> 24) & 63) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (6 - 4)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 4) & 63) == value) { - *found = 6; - return 6; - } - if (((tmp >> 10) & 63) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 63) == value) { - *found = 8; - return 8; - } - if (((tmp >> 22) & 63) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (6 - 2)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 2) & 63) == value) { - *found = 11; - return 11; - } - if (((tmp >> 8) & 63) == value) { - *found = 12; - return 12; - } - if (((tmp >> 14) & 63) == value) { - *found = 13; - return 13; - } - if (((tmp >> 20) & 63) == value) { - *found = 14; - return 14; - } - if (((tmp >> 26) & 63) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 63) == value) { - *found = 16; - return 16; - } - if (((tmp >> 6) & 63) == value) { - *found = 17; - return 17; - } - if (((tmp >> 12) & 63) == value) { - *found = 18; - return 18; - } - if (((tmp >> 18) & 63) == value) { - *found = 19; - return 19; - } - if (((tmp >> 24) & 63) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (6 - 4)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 4) & 63) == value) { - *found = 
22; - return 22; - } - if (((tmp >> 10) & 63) == value) { - *found = 23; - return 23; - } - if (((tmp >> 16) & 63) == value) { - *found = 24; - return 24; - } - if (((tmp >> 22) & 63) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (6 - 2)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 2) & 63) == value) { - *found = 27; - return 27; - } - if (((tmp >> 8) & 63) == value) { - *found = 28; - return 28; - } - if (((tmp >> 14) & 63) == value) { - *found = 29; - return 29; - } - if (((tmp >> 20) & 63) == value) { - *found = 30; - return 30; - } - if (((tmp >> 26) & 63) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (24); -} - -static uint32_t -linsearch7_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 127) == value) { - *found = 0; - return 0; - } - if (((tmp >> 7) & 127) == value) { - *found = 1; - return 1; - } - if (((tmp >> 14) & 127) == value) { - *found = 2; - return 2; - } - if (((tmp >> 21) & 127) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (7 - 3)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 3) & 127) == value) { - *found = 5; - return 5; - } - if (((tmp >> 10) & 127) == value) { - *found = 6; - return 6; - } - if (((tmp >> 17) & 127) == value) { - *found = 7; - return 7; - } - if (((tmp >> 24) & 127) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (7 - 6)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 6) & 127) == value) { - 
*found = 10; - return 10; - } - if (((tmp >> 13) & 127) == value) { - *found = 11; - return 11; - } - if (((tmp >> 20) & 127) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (7 - 2)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 2) & 127) == value) { - *found = 14; - return 14; - } - if (((tmp >> 9) & 127) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 127) == value) { - *found = 16; - return 16; - } - if (((tmp >> 23) & 127) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (7 - 5)) == value) { - *found = 18; - return 18; - } - if (((tmp >> 5) & 127) == value) { - *found = 19; - return 19; - } - if (((tmp >> 12) & 127) == value) { - *found = 20; - return 20; - } - if (((tmp >> 19) & 127) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (7 - 1)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 1) & 127) == value) { - *found = 23; - return 23; - } - if (((tmp >> 8) & 127) == value) { - *found = 24; - return 24; - } - if (((tmp >> 15) & 127) == value) { - *found = 25; - return 25; - } - if (((tmp >> 22) & 127) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (7 - 4)) == value) { - *found = 27; - return 27; - } - if (((tmp >> 4) & 127) == value) { - *found = 28; - return 28; - } - if (((tmp >> 11) & 127) == value) { - *found = 29; - return 29; - } - if (((tmp >> 18) & 127) == value) { - *found = 30; - return 30; - } - if (((tmp >> 25) & 127) == value) { - *found = 31; - return 31; - 
} - /* remaining: 0 bits */ - return (28); -} - -static uint32_t -linsearch8_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 255) == value) { - *found = 0; - return 0; - } - if (((tmp >> 8) & 255) == value) { - *found = 1; - return 1; - } - if (((tmp >> 16) & 255) == value) { - *found = 2; - return 2; - } - if (((tmp >> 24) & 255) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 255) == value) { - *found = 4; - return 4; - } - if (((tmp >> 8) & 255) == value) { - *found = 5; - return 5; - } - if (((tmp >> 16) & 255) == value) { - *found = 6; - return 6; - } - if (((tmp >> 24) & 255) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 12) */ - if (((tmp >> 0) & 255) == value) { - *found = 8; - return 8; - } - if (((tmp >> 8) & 255) == value) { - *found = 9; - return 9; - } - if (((tmp >> 16) & 255) == value) { - *found = 10; - return 10; - } - if (((tmp >> 24) & 255) == value) { - *found = 11; - return 11; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 255) == value) { - *found = 12; - return 12; - } - if (((tmp >> 8) & 255) == value) { - *found = 13; - return 13; - } - if (((tmp >> 16) & 255) == value) { - *found = 14; - return 14; - } - if (((tmp >> 24) & 255) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 20) */ - if (((tmp >> 0) & 255) == value) { - *found = 16; - return 16; - } - if (((tmp >> 8) & 255) == value) { - *found = 17; - return 17; - } - if (((tmp >> 16) & 255) == value) { - *found = 18; - return 18; - } - if (((tmp >> 24) & 255) == value) { - *found = 19; - return 19; - } - in += 
sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 24) */ - if (((tmp >> 0) & 255) == value) { - *found = 20; - return 20; - } - if (((tmp >> 8) & 255) == value) { - *found = 21; - return 21; - } - if (((tmp >> 16) & 255) == value) { - *found = 22; - return 22; - } - if (((tmp >> 24) & 255) == value) { - *found = 23; - return 23; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 28) */ - if (((tmp >> 0) & 255) == value) { - *found = 24; - return 24; - } - if (((tmp >> 8) & 255) == value) { - *found = 25; - return 25; - } - if (((tmp >> 16) & 255) == value) { - *found = 26; - return 26; - } - if (((tmp >> 24) & 255) == value) { - *found = 27; - return 27; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 32) */ - if (((tmp >> 0) & 255) == value) { - *found = 28; - return 28; - } - if (((tmp >> 8) & 255) == value) { - *found = 29; - return 29; - } - if (((tmp >> 16) & 255) == value) { - *found = 30; - return 30; - } - if (((tmp >> 24) & 255) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (32); -} - -static uint32_t -linsearch9_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 511) == value) { - *found = 0; - return 0; - } - if (((tmp >> 9) & 511) == value) { - *found = 1; - return 1; - } - if (((tmp >> 18) & 511) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (9 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 511) == value) { - *found = 4; - return 4; - } - if (((tmp >> 13) & 511) == value) { - *found = 5; - return 5; - } - if (((tmp >> 22) & 511) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (9 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 511) == value) { - *found = 8; - return 8; - } - if (((tmp >> 17) & 511) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (9 - 3)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 3) & 511) == value) { - *found = 11; - return 11; - } - if (((tmp >> 12) & 511) == value) { - *found = 12; - return 12; - } - if (((tmp >> 21) & 511) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (9 - 7)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 7) & 511) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 511) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (9 - 2)) == value) { - *found = 17; - return 17; - } - if (((tmp >> 2) & 511) == value) { - *found = 18; - return 18; - } - if (((tmp >> 11) & 511) == value) { - *found = 19; - return 19; - } - if (((tmp >> 20) & 511) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (9 - 6)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 6) & 511) == value) { - *found = 22; - return 22; - } - if (((tmp >> 15) & 511) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (9 - 1)) == value) { - *found = 24; - return 24; - } - if (((tmp >> 1) & 511) == value) { - 
*found = 25; - return 25; - } - if (((tmp >> 10) & 511) == value) { - *found = 26; - return 26; - } - if (((tmp >> 19) & 511) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (9 - 5)) == value) { - *found = 28; - return 28; - } - if (((tmp >> 5) & 511) == value) { - *found = 29; - return 29; - } - if (((tmp >> 14) & 511) == value) { - *found = 30; - return 30; - } - if (((tmp >> 23) & 511) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (36); -} - -static uint32_t -linsearch10_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1023) == value) { - *found = 0; - return 0; - } - if (((tmp >> 10) & 1023) == value) { - *found = 1; - return 1; - } - if (((tmp >> 20) & 1023) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (10 - 8)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 1023) == value) { - *found = 4; - return 4; - } - if (((tmp >> 18) & 1023) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (10 - 6)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 6) & 1023) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 1023) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (10 - 4)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 4) & 1023) == value) { - *found = 10; - return 10; - } - if (((tmp >> 14) & 1023) 
== value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (10 - 2)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 2) & 1023) == value) { - *found = 13; - return 13; - } - if (((tmp >> 12) & 1023) == value) { - *found = 14; - return 14; - } - if (((tmp >> 22) & 1023) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 24) */ - if (((tmp >> 0) & 1023) == value) { - *found = 16; - return 16; - } - if (((tmp >> 10) & 1023) == value) { - *found = 17; - return 17; - } - if (((tmp >> 20) & 1023) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (10 - 8)) == value) { - *found = 19; - return 19; - } - if (((tmp >> 8) & 1023) == value) { - *found = 20; - return 20; - } - if (((tmp >> 18) & 1023) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (10 - 6)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 6) & 1023) == value) { - *found = 23; - return 23; - } - if (((tmp >> 16) & 1023) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (10 - 4)) == value) { - *found = 25; - return 25; - } - if (((tmp >> 4) & 1023) == value) { - *found = 26; - return 26; - } - if (((tmp >> 14) & 1023) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (10 - 2)) == value) { - *found = 28; - return 28; - } - if (((tmp >> 2) & 1023) == 
value) { - *found = 29; - return 29; - } - if (((tmp >> 12) & 1023) == value) { - *found = 30; - return 30; - } - if (((tmp >> 22) & 1023) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (40); -} - -static uint32_t -linsearch11_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2047) == value) { - *found = 0; - return 0; - } - if (((tmp >> 11) & 2047) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (11 - 1)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 1) & 2047) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 2047) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (11 - 2)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 2) & 2047) == value) { - *found = 6; - return 6; - } - if (((tmp >> 13) & 2047) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (11 - 3)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 3) & 2047) == value) { - *found = 9; - return 9; - } - if (((tmp >> 14) & 2047) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (11 - 4)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 2047) == value) { - *found = 12; - return 12; - } - if (((tmp >> 15) & 2047) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) 
*/ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (11 - 5)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 5) & 2047) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 2047) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (11 - 6)) == value) { - *found = 17; - return 17; - } - if (((tmp >> 6) & 2047) == value) { - *found = 18; - return 18; - } - if (((tmp >> 17) & 2047) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (11 - 7)) == value) { - *found = 20; - return 20; - } - if (((tmp >> 7) & 2047) == value) { - *found = 21; - return 21; - } - if (((tmp >> 18) & 2047) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (11 - 8)) == value) { - *found = 23; - return 23; - } - if (((tmp >> 8) & 2047) == value) { - *found = 24; - return 24; - } - if (((tmp >> 19) & 2047) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (11 - 9)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 9) & 2047) == value) { - *found = 27; - return 27; - } - if (((tmp >> 20) & 2047) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (11 - 10)) == value) { - *found = 29; - return 29; - } - if (((tmp >> 10) & 2047) == value) { - *found = 30; - return 30; - } - if (((tmp >> 21) & 2047) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (44); -} 
- -static uint32_t -linsearch12_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 4095) == value) { - *found = 0; - return 0; - } - if (((tmp >> 12) & 4095) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 4) & 4095) == value) { - *found = 3; - return 3; - } - if (((tmp >> 16) & 4095) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 8) & 4095) == value) { - *found = 6; - return 6; - } - if (((tmp >> 20) & 4095) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 4095) == value) { - *found = 8; - return 8; - } - if (((tmp >> 12) & 4095) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 4) & 4095) == value) { - *found = 11; - return 11; - } - if (((tmp >> 16) & 4095) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 8) & 4095) == value) { - *found = 14; - return 14; - } - if (((tmp >> 20) & 4095) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes 
(total: 28) */ - if (((tmp >> 0) & 4095) == value) { - *found = 16; - return 16; - } - if (((tmp >> 12) & 4095) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 18; - return 18; - } - if (((tmp >> 4) & 4095) == value) { - *found = 19; - return 19; - } - if (((tmp >> 16) & 4095) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 8) & 4095) == value) { - *found = 22; - return 22; - } - if (((tmp >> 20) & 4095) == value) { - *found = 23; - return 23; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 40) */ - if (((tmp >> 0) & 4095) == value) { - *found = 24; - return 24; - } - if (((tmp >> 12) & 4095) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 4) & 4095) == value) { - *found = 27; - return 27; - } - if (((tmp >> 16) & 4095) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 29; - return 29; - } - if (((tmp >> 8) & 4095) == value) { - *found = 30; - return 30; - } - if (((tmp >> 20) & 4095) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (48); -} - -static uint32_t -linsearch13_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ 
- if (((tmp >> 0) & 8191) == value) { - *found = 0; - return 0; - } - if (((tmp >> 13) & 8191) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (13 - 7)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 7) & 8191) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (13 - 1)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 1) & 8191) == value) { - *found = 5; - return 5; - } - if (((tmp >> 14) & 8191) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (13 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 8191) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (13 - 2)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 2) & 8191) == value) { - *found = 10; - return 10; - } - if (((tmp >> 15) & 8191) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (13 - 9)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 9) & 8191) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (13 - 3)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 3) & 8191) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 8191) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (13 - 10)) == value) { - *found = 17; - return 17; - } - if (((tmp >> 10) & 8191) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (13 - 4)) == value) { - *found = 19; - return 19; - } - if (((tmp >> 4) & 8191) == value) { - *found = 20; - return 20; - } - if (((tmp >> 17) & 8191) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (13 - 11)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 11) & 8191) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (13 - 5)) == value) { - *found = 24; - return 24; - } - if (((tmp >> 5) & 8191) == value) { - *found = 25; - return 25; - } - if (((tmp >> 18) & 8191) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (13 - 12)) == value) { - *found = 27; - return 27; - } - if (((tmp >> 12) & 8191) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (13 - 6)) == value) { - *found = 29; - return 29; - } - if (((tmp >> 6) & 8191) == value) { - *found = 30; - return 30; - } - if (((tmp >> 19) & 8191) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (52); -} - -static uint32_t -linsearch14_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; 
- /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 16383) == value) { - *found = 0; - return 0; - } - if (((tmp >> 14) & 16383) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (14 - 10)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 10) & 16383) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (14 - 6)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 6) & 16383) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (14 - 2)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 2) & 16383) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 16383) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (14 - 12)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 12) & 16383) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (14 - 8)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 8) & 16383) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (14 - 4)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 4) & 16383) == value) { - *found = 14; - return 14; - } - if (((tmp >> 18) & 16383) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes 
(total: 32) */ - if (((tmp >> 0) & 16383) == value) { - *found = 16; - return 16; - } - if (((tmp >> 14) & 16383) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (14 - 10)) == value) { - *found = 18; - return 18; - } - if (((tmp >> 10) & 16383) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (14 - 6)) == value) { - *found = 20; - return 20; - } - if (((tmp >> 6) & 16383) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (14 - 2)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 2) & 16383) == value) { - *found = 23; - return 23; - } - if (((tmp >> 16) & 16383) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (14 - 12)) == value) { - *found = 25; - return 25; - } - if (((tmp >> 12) & 16383) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (14 - 8)) == value) { - *found = 27; - return 27; - } - if (((tmp >> 8) & 16383) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (14 - 4)) == value) { - *found = 29; - return 29; - } - if (((tmp >> 4) & 16383) == value) { - *found = 30; - return 30; - } - if (((tmp >> 18) & 16383) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (56); -} - -static uint32_t 
-linsearch15_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 32767) == value) { - *found = 0; - return 0; - } - if (((tmp >> 15) & 32767) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (15 - 13)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 13) & 32767) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (15 - 11)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 11) & 32767) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (15 - 9)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 9) & 32767) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (15 - 7)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 7) & 32767) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (15 - 5)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 5) & 32767) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (15 - 3)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 3) & 32767) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 
bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (15 - 1)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 1) & 32767) == value) { - *found = 15; - return 15; - } - if (((tmp >> 16) & 32767) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (15 - 14)) == value) { - *found = 17; - return 17; - } - if (((tmp >> 14) & 32767) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (15 - 12)) == value) { - *found = 19; - return 19; - } - if (((tmp >> 12) & 32767) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (15 - 10)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 10) & 32767) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (15 - 8)) == value) { - *found = 23; - return 23; - } - if (((tmp >> 8) & 32767) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (15 - 6)) == value) { - *found = 25; - return 25; - } - if (((tmp >> 6) & 32767) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (15 - 4)) == value) { - *found = 27; - return 27; - } - if (((tmp >> 4) & 32767) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = 
*(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (15 - 2)) == value) { - *found = 29; - return 29; - } - if (((tmp >> 2) & 32767) == value) { - *found = 30; - return 30; - } - if (((tmp >> 17) & 32767) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (60); -} - -static uint32_t -linsearch16_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 65535) == value) { - *found = 0; - return 0; - } - if (((tmp >> 16) & 65535) == value) { - *found = 1; - return 1; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 65535) == value) { - *found = 2; - return 2; - } - if (((tmp >> 16) & 65535) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 12) */ - if (((tmp >> 0) & 65535) == value) { - *found = 4; - return 4; - } - if (((tmp >> 16) & 65535) == value) { - *found = 5; - return 5; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 65535) == value) { - *found = 6; - return 6; - } - if (((tmp >> 16) & 65535) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 20) */ - if (((tmp >> 0) & 65535) == value) { - *found = 8; - return 8; - } - if (((tmp >> 16) & 65535) == value) { - *found = 9; - return 9; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 24) */ - if (((tmp >> 0) & 65535) == value) { - *found = 10; - return 10; - } - if (((tmp >> 16) & 65535) == value) { - *found = 11; - return 11; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 28) */ - if (((tmp >> 0) & 65535) == value) { - *found = 12; - return 12; - } - if (((tmp >> 16) & 65535) == value) { - *found = 
13; - return 13; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 32) */ - if (((tmp >> 0) & 65535) == value) { - *found = 14; - return 14; - } - if (((tmp >> 16) & 65535) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 36) */ - if (((tmp >> 0) & 65535) == value) { - *found = 16; - return 16; - } - if (((tmp >> 16) & 65535) == value) { - *found = 17; - return 17; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 40) */ - if (((tmp >> 0) & 65535) == value) { - *found = 18; - return 18; - } - if (((tmp >> 16) & 65535) == value) { - *found = 19; - return 19; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 44) */ - if (((tmp >> 0) & 65535) == value) { - *found = 20; - return 20; - } - if (((tmp >> 16) & 65535) == value) { - *found = 21; - return 21; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 48) */ - if (((tmp >> 0) & 65535) == value) { - *found = 22; - return 22; - } - if (((tmp >> 16) & 65535) == value) { - *found = 23; - return 23; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 52) */ - if (((tmp >> 0) & 65535) == value) { - *found = 24; - return 24; - } - if (((tmp >> 16) & 65535) == value) { - *found = 25; - return 25; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 56) */ - if (((tmp >> 0) & 65535) == value) { - *found = 26; - return 26; - } - if (((tmp >> 16) & 65535) == value) { - *found = 27; - return 27; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 60) */ - if (((tmp >> 0) & 65535) == value) { - *found = 28; - return 28; - } - if (((tmp >> 16) & 65535) == value) { - *found = 29; - return 29; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 64) */ - if (((tmp >> 0) & 65535) == 
value) { - *found = 30; - return 30; - } - if (((tmp >> 16) & 65535) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (64); -} - -static uint32_t -linsearch17_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 131071) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (17 - 2)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 2) & 131071) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (17 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 131071) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (17 - 6)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 6) & 131071) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (17 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 131071) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (17 - 10)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 10) & 131071) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (17 - 12)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 12) & 
131071) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (17 - 14)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 14) & 131071) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (17 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (17 - 1)) == value) { - *found = 16; - return 16; - } - if (((tmp >> 1) & 131071) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (17 - 3)) == value) { - *found = 18; - return 18; - } - if (((tmp >> 3) & 131071) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (17 - 5)) == value) { - *found = 20; - return 20; - } - if (((tmp >> 5) & 131071) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (17 - 7)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 7) & 131071) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (17 - 9)) == value) { - *found = 24; - return 24; - } - if (((tmp >> 9) & 131071) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | 
(tmp % (1U << 11)) << (17 - 11)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 11) & 131071) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (17 - 13)) == value) { - *found = 28; - return 28; - } - if (((tmp >> 13) & 131071) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (17 - 15)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 15) & 131071) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (68); -} - -static uint32_t -linsearch18_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 262143) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (18 - 4)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 4) & 262143) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (18 - 8)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 262143) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (18 - 12)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 12) & 262143) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (18 - 16)) == value) { - 
*found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (18 - 2)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 2) & 262143) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (18 - 6)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 6) & 262143) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (18 - 10)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 10) & 262143) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (18 - 14)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 14) & 262143) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 40) */ - if (((tmp >> 0) & 262143) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (18 - 4)) == value) { - *found = 17; - return 17; - } - if (((tmp >> 4) & 262143) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (18 - 8)) == value) { - *found = 19; - return 19; - } - if (((tmp >> 8) & 262143) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (18 - 12)) == value) { - *found = 21; - 
return 21; - } - if (((tmp >> 12) & 262143) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (18 - 16)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (18 - 2)) == value) { - *found = 24; - return 24; - } - if (((tmp >> 2) & 262143) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (18 - 6)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 6) & 262143) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (18 - 10)) == value) { - *found = 28; - return 28; - } - if (((tmp >> 10) & 262143) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (18 - 14)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 14) & 262143) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (72); -} - -static uint32_t -linsearch19_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 524287) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (19 - 6)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 6) & 524287) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 25; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (19 - 12)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 524287) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (19 - 18)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (19 - 5)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 5) & 524287) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (19 - 11)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 11) & 524287) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (19 - 17)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (19 - 4)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 524287) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (19 - 10)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 10) & 524287) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (19 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) 
*/ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (19 - 3)) == value) { - *found = 16; - return 16; - } - if (((tmp >> 3) & 524287) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (19 - 9)) == value) { - *found = 18; - return 18; - } - if (((tmp >> 9) & 524287) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (19 - 15)) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (19 - 2)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 2) & 524287) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (19 - 8)) == value) { - *found = 23; - return 23; - } - if (((tmp >> 8) & 524287) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (19 - 14)) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (19 - 1)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 1) & 524287) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (19 - 7)) == value) { - *found = 28; - return 28; - } - if (((tmp >> 7) & 524287) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 
4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (19 - 13)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 13) & 524287) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (76); -} - -static uint32_t -linsearch20_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1048575) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 8) & 1048575) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 4) & 1048575) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 12) & 1048575) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 24) */ - if (((tmp >> 0) & 1048575) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 8) & 1048575) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 28; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 4) & 1048575) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 12) & 1048575) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 44) */ - if (((tmp >> 0) & 1048575) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 17; - return 17; - } - if (((tmp >> 8) & 1048575) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 20; - return 20; - } - if (((tmp >> 4) & 1048575) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 12) & 1048575) == value) { - *found = 23; - return 23; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 64) */ - if (((tmp >> 
0) & 1048575) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 25; - return 25; - } - if (((tmp >> 8) & 1048575) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 28; - return 28; - } - if (((tmp >> 4) & 1048575) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 12) & 1048575) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (80); -} - -static uint32_t -linsearch21_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2097151) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (21 - 10)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 10) & 2097151) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (21 - 20)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % 
(1U << 9)) << (21 - 9)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 9) & 2097151) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (21 - 19)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (21 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 2097151) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (21 - 18)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (21 - 7)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 7) & 2097151) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (21 - 17)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (21 - 6)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 6) & 2097151) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (21 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (21 - 5)) == value) { - *found = 16; - return 16; - } - if (((tmp >> 5) & 2097151) == value) { - *found = 17; - 
return 17; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (21 - 15)) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (21 - 4)) == value) { - *found = 19; - return 19; - } - if (((tmp >> 4) & 2097151) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (21 - 14)) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (21 - 3)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 3) & 2097151) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (21 - 13)) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 13; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (21 - 2)) == value) { - *found = 25; - return 25; - } - if (((tmp >> 2) & 2097151) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (21 - 12)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (21 - 1)) == value) { - *found = 28; - return 28; - } - if (((tmp >> 1) & 2097151) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = 
*(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (21 - 11)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 11) & 2097151) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (84); -} - -static uint32_t -linsearch22_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 4194303) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (22 - 12)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (22 - 2)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 2) & 4194303) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (22 - 14)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (22 - 4)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 4) & 4194303) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (22 - 16)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (22 - 6)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 6) & 4194303) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = 
*(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (22 - 18)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (22 - 8)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 8) & 4194303) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (22 - 20)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (22 - 10)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 10) & 4194303) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 48) */ - if (((tmp >> 0) & 4194303) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (22 - 12)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (22 - 2)) == value) { - *found = 18; - return 18; - } - if (((tmp >> 2) & 4194303) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (22 - 14)) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (22 - 4)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 4) & 4194303) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 26; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (22 - 16)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (22 - 6)) == value) { - *found = 24; - return 24; - } - if (((tmp >> 6) & 4194303) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (22 - 18)) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (22 - 8)) == value) { - *found = 27; - return 27; - } - if (((tmp >> 8) & 4194303) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (22 - 20)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (22 - 10)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 10) & 4194303) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (88); -} - -static uint32_t -linsearch23_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 8388607) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (23 - 14)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - 
tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (23 - 5)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 5) & 8388607) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (23 - 19)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (23 - 10)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (23 - 1)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 1) & 8388607) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (23 - 15)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (23 - 6)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 6) & 8388607) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (23 - 20)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (23 - 11)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (23 - 2)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 2) & 8388607) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 
25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (23 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (23 - 7)) == value) { - *found = 16; - return 16; - } - if (((tmp >> 7) & 8388607) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 21)) << (23 - 21)) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (23 - 12)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (23 - 3)) == value) { - *found = 20; - return 20; - } - if (((tmp >> 3) & 8388607) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (23 - 17)) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (23 - 8)) == value) { - *found = 23; - return 23; - } - if (((tmp >> 8) & 8388607) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (23 - 22)) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (23 - 13)) == value) { - *found = 26; - return 26; - } - 
tmp2 = tmp >> 13; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (23 - 4)) == value) { - *found = 27; - return 27; - } - if (((tmp >> 4) & 8388607) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (23 - 18)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (23 - 9)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 9) & 8388607) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (92); -} - -static uint32_t -linsearch24_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t 
*)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 28) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 11; - return 11; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 40) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 52) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 18; - return 18; - } - if 
(((tmp >> 8) & 16777215) == value) { - *found = 19; - return 19; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 64) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 23; - return 23; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 76) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 27; - return 27; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 88) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ 
- return (96); -} - -static uint32_t -linsearch25_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 33554431) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (25 - 18)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (25 - 11)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (25 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 33554431) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (25 - 22)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (25 - 15)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (25 - 8)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (25 - 1)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 1) & 33554431) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (25 - 19)) == value) { - 
*found = 10; - return 10; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (25 - 12)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (25 - 5)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 5) & 33554431) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (25 - 23)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (25 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (25 - 9)) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 9; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (25 - 2)) == value) { - *found = 17; - return 17; - } - if (((tmp >> 2) & 33554431) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (25 - 20)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (25 - 13)) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 13; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (25 - 6)) == value) { - *found = 21; - return 21; - } - if (((tmp >> 6) & 33554431) 
== value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (25 - 24)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (25 - 17)) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (25 - 10)) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (25 - 3)) == value) { - *found = 26; - return 26; - } - if (((tmp >> 3) & 33554431) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 21)) << (25 - 21)) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (25 - 14)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (25 - 7)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 7) & 33554431) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (100); -} - -static uint32_t -linsearch26_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 67108863) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) 
*/ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (26 - 20)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (26 - 14)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (26 - 8)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (26 - 2)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 2) & 67108863) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (26 - 22)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (26 - 16)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (26 - 10)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (26 - 4)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 4) & 67108863) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (26 - 24)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (26 - 18)) 
== value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (26 - 12)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (26 - 6)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 6) & 67108863) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 56) */ - if (((tmp >> 0) & 67108863) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (26 - 20)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (26 - 14)) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (26 - 8)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (26 - 2)) == value) { - *found = 20; - return 20; - } - if (((tmp >> 2) & 67108863) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (26 - 22)) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (26 - 16)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (26 - 10)) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (26 - 4)) == value) { - *found = 25; - return 25; - } - if (((tmp >> 4) & 67108863) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (26 - 24)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (26 - 18)) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (26 - 12)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (26 - 6)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 6) & 67108863) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (104); -} - -static uint32_t -linsearch27_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 134217727) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (27 - 22)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (27 - 17)) == value) { - 
*found = 2; - return 2; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (27 - 12)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (27 - 7)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 7; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (27 - 2)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 2) & 134217727) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (27 - 24)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (27 - 19)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (27 - 14)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (27 - 9)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 9; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (27 - 4)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 134217727) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (27 - 26)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); 
- /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 21)) << (27 - 21)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (27 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (27 - 11)) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (27 - 6)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 6; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (27 - 1)) == value) { - *found = 18; - return 18; - } - if (((tmp >> 1) & 134217727) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (27 - 23)) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (27 - 18)) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (27 - 13)) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 13; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (27 - 8)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (27 - 3)) == value) { - *found = 
24; - return 24; - } - if (((tmp >> 3) & 134217727) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 25)) << (27 - 25)) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (27 - 20)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (27 - 15)) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (27 - 10)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (27 - 5)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 5) & 134217727) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (108); -} - -static uint32_t -linsearch28_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 268435455) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = 
*(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 4) & 268435455) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 32) */ - if (((tmp >> 0) & 268435455) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 4) & 268435455) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 60) */ - if (((tmp >> 0) & 268435455) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 22; - return 22; - } - if (((tmp >> 4) & 268435455) == value) { - *found = 23; - return 23; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 88) */ - if (((tmp >> 0) & 268435455) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 25; - return 25; - 
} - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 112) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 4) & 268435455) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (112); -} - -static uint32_t -linsearch29_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 536870911) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (29 - 26)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (29 - 23)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (29 - 20)) == value) { - *found = 3; - return 3; - 
} - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (29 - 17)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (29 - 14)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (29 - 11)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (29 - 8)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (29 - 5)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 5; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (29 - 2)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 2) & 536870911) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (29 - 28)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 25)) << (29 - 25)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (29 - 22)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) 
<< (29 - 19)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (29 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (29 - 13)) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 13; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (29 - 10)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (29 - 7)) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 7; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (29 - 4)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 4; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (29 - 1)) == value) { - *found = 20; - return 20; - } - if (((tmp >> 1) & 536870911) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 27)) << (29 - 27)) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (29 - 24)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 21)) << (29 - 21)) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (29 - 18)) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (29 - 15)) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (29 - 12)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (29 - 9)) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 9; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 112) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (29 - 6)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 6; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 116) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (29 - 3)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 3) & 536870911) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (116); -} - -static uint32_t -linsearch30_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1073741823) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (30 - 28)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (30 - 26)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes 
(total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (30 - 24)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (30 - 22)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (30 - 20)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (30 - 18)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (30 - 16)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (30 - 14)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (30 - 12)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (30 - 10)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (30 - 8)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (30 - 6)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 6; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - 
tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (30 - 4)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 4; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (30 - 2)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 2) & 1073741823) == value) { - *found = 15; - return 15; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 64) */ - if (((tmp >> 0) & 1073741823) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (30 - 28)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (30 - 26)) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (30 - 24)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (30 - 22)) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (30 - 20)) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (30 - 18)) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (30 - 16)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - 
/* consumed: 4 bytes (total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (30 - 14)) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (30 - 12)) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (30 - 10)) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (30 - 8)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 112) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (30 - 6)) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 6; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 116) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (30 - 4)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 4; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 120) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (30 - 2)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 2) & 1073741823) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (120); -} - -static uint32_t -linsearch31_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2147483647) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 30)) << (31 - 30)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); 
- /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 29)) << (31 - 29)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (31 - 28)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 27)) << (31 - 27)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (31 - 26)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 25)) << (31 - 25)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (31 - 24)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (31 - 23)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (31 - 22)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 21)) << (31 - 21)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (31 - 20)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 
4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (31 - 19)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (31 - 18)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (31 - 17)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (31 - 16)) == value) { - *found = 15; - return 15; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 68) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (31 - 15)) == value) { - *found = 16; - return 16; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 72) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (31 - 14)) == value) { - *found = 17; - return 17; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 76) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (31 - 13)) == value) { - *found = 18; - return 18; - } - tmp2 = tmp >> 13; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 80) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (31 - 12)) == value) { - *found = 19; - return 19; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 84) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (31 - 11)) == value) { - *found = 20; - return 20; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 88) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (31 - 10)) == value) { - *found = 21; - return 21; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 
4 bytes (total: 92) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (31 - 9)) == value) { - *found = 22; - return 22; - } - tmp2 = tmp >> 9; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 96) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (31 - 8)) == value) { - *found = 23; - return 23; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 100) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (31 - 7)) == value) { - *found = 24; - return 24; - } - tmp2 = tmp >> 7; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 104) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (31 - 6)) == value) { - *found = 25; - return 25; - } - tmp2 = tmp >> 6; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 108) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (31 - 5)) == value) { - *found = 26; - return 26; - } - tmp2 = tmp >> 5; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 112) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (31 - 4)) == value) { - *found = 27; - return 27; - } - tmp2 = tmp >> 4; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 116) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (31 - 3)) == value) { - *found = 28; - return 28; - } - tmp2 = tmp >> 3; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 120) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (31 - 2)) == value) { - *found = 29; - return 29; - } - tmp2 = tmp >> 2; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 124) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (31 - 1)) == value) { - *found = 30; - return 30; - } - if (((tmp >> 1) & 2147483647) == value) { - *found = 31; - return 31; - } - /* remaining: 0 bits */ - return (124); -} - -static uint32_t -linsearch32_32(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t i; - uint32_t *in32 = (uint32_t *)in; - value -= 
base; - for (i = 0; i < 32; i++) { - if (in32[i] == value) { - *found = i; - return 0; - } - } - return 32 * sizeof(uint32_t); -} - -for_linsearchfunc_t for_linsearch32[33] = { - linsearch0_n, - linsearch1_32, - linsearch2_32, - linsearch3_32, - linsearch4_32, - linsearch5_32, - linsearch6_32, - linsearch7_32, - linsearch8_32, - linsearch9_32, - linsearch10_32, - linsearch11_32, - linsearch12_32, - linsearch13_32, - linsearch14_32, - linsearch15_32, - linsearch16_32, - linsearch17_32, - linsearch18_32, - linsearch19_32, - linsearch20_32, - linsearch21_32, - linsearch22_32, - linsearch23_32, - linsearch24_32, - linsearch25_32, - linsearch26_32, - linsearch27_32, - linsearch28_32, - linsearch29_32, - linsearch30_32, - linsearch31_32, - linsearch32_32 -}; - -static uint32_t -linsearch1_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1) == value) { - *found = 0; - return 0; - } - if (((tmp >> 1) & 1) == value) { - *found = 1; - return 1; - } - if (((tmp >> 2) & 1) == value) { - *found = 2; - return 2; - } - if (((tmp >> 3) & 1) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 1) == value) { - *found = 4; - return 4; - } - if (((tmp >> 5) & 1) == value) { - *found = 5; - return 5; - } - if (((tmp >> 6) & 1) == value) { - *found = 6; - return 6; - } - if (((tmp >> 7) & 1) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 1) == value) { - *found = 8; - return 8; - } - if (((tmp >> 9) & 1) == value) { - *found = 9; - return 9; - } - if (((tmp >> 10) & 1) == value) { - *found = 10; - return 10; - } - if (((tmp >> 11) & 1) == value) { - *found = 11; - return 11; - } - if (((tmp >> 12) & 1) == value) { - *found = 12; - return 12; - } - if (((tmp >> 13) & 1) == value) { - *found = 13; - return 13; - } - if (((tmp >> 14) & 1) == value) { - *found = 14; - return 14; - } - if (((tmp >> 15) & 1) == value) 
{ - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (2); -} - -static uint32_t -linsearch2_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 3) == value) { - *found = 0; - return 0; - } - if (((tmp >> 2) & 3) == value) { - *found = 1; - return 1; - } - if (((tmp >> 4) & 3) == value) { - *found = 2; - return 2; - } - if (((tmp >> 6) & 3) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 3) == value) { - *found = 4; - return 4; - } - if (((tmp >> 10) & 3) == value) { - *found = 5; - return 5; - } - if (((tmp >> 12) & 3) == value) { - *found = 6; - return 6; - } - if (((tmp >> 14) & 3) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 3) == value) { - *found = 8; - return 8; - } - if (((tmp >> 18) & 3) == value) { - *found = 9; - return 9; - } - if (((tmp >> 20) & 3) == value) { - *found = 10; - return 10; - } - if (((tmp >> 22) & 3) == value) { - *found = 11; - return 11; - } - if (((tmp >> 24) & 3) == value) { - *found = 12; - return 12; - } - if (((tmp >> 26) & 3) == value) { - *found = 13; - return 13; - } - if (((tmp >> 28) & 3) == value) { - *found = 14; - return 14; - } - if (((tmp >> 30) & 3) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (4); -} - -static uint32_t -linsearch3_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 7) == value) { - *found = 0; - return 0; - } - if (((tmp >> 3) & 7) == value) { - *found = 1; - return 1; - } - if (((tmp >> 6) & 7) == value) { - *found = 2; - return 2; - } - if (((tmp >> 9) & 7) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 7) == value) { - *found = 4; - return 4; - } - if (((tmp >> 15) & 7) == value) { - *found = 5; - return 5; - 
} - if (((tmp >> 18) & 7) == value) { - *found = 6; - return 6; - } - if (((tmp >> 21) & 7) == value) { - *found = 7; - return 7; - } - if (((tmp >> 24) & 7) == value) { - *found = 8; - return 8; - } - if (((tmp >> 27) & 7) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (3 - 1)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 1) & 7) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 7) == value) { - *found = 12; - return 12; - } - if (((tmp >> 7) & 7) == value) { - *found = 13; - return 13; - } - if (((tmp >> 10) & 7) == value) { - *found = 14; - return 14; - } - if (((tmp >> 13) & 7) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (6); -} - -static uint32_t -linsearch4_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 15) == value) { - *found = 0; - return 0; - } - if (((tmp >> 4) & 15) == value) { - *found = 1; - return 1; - } - if (((tmp >> 8) & 15) == value) { - *found = 2; - return 2; - } - if (((tmp >> 12) & 15) == value) { - *found = 3; - return 3; - } - if (((tmp >> 16) & 15) == value) { - *found = 4; - return 4; - } - if (((tmp >> 20) & 15) == value) { - *found = 5; - return 5; - } - if (((tmp >> 24) & 15) == value) { - *found = 6; - return 6; - } - if (((tmp >> 28) & 15) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 15) == value) { - *found = 8; - return 8; - } - if (((tmp >> 4) & 15) == value) { - *found = 9; - return 9; - } - if (((tmp >> 8) & 15) == value) { - *found = 10; - return 10; - } - if (((tmp >> 12) & 15) == value) { - *found = 11; - return 11; - } - if (((tmp >> 16) & 15) == value) { - *found = 12; - 
return 12; - } - if (((tmp >> 20) & 15) == value) { - *found = 13; - return 13; - } - if (((tmp >> 24) & 15) == value) { - *found = 14; - return 14; - } - if (((tmp >> 28) & 15) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (8); -} - -static uint32_t -linsearch5_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 31) == value) { - *found = 0; - return 0; - } - if (((tmp >> 5) & 31) == value) { - *found = 1; - return 1; - } - if (((tmp >> 10) & 31) == value) { - *found = 2; - return 2; - } - if (((tmp >> 15) & 31) == value) { - *found = 3; - return 3; - } - if (((tmp >> 20) & 31) == value) { - *found = 4; - return 4; - } - if (((tmp >> 25) & 31) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (5 - 3)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 3) & 31) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 31) == value) { - *found = 8; - return 8; - } - if (((tmp >> 13) & 31) == value) { - *found = 9; - return 9; - } - if (((tmp >> 18) & 31) == value) { - *found = 10; - return 10; - } - if (((tmp >> 23) & 31) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (5 - 1)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 1) & 31) == value) { - *found = 13; - return 13; - } - if (((tmp >> 6) & 31) == value) { - *found = 14; - return 14; - } - if (((tmp >> 11) & 31) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (10); -} - -static uint32_t -linsearch6_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - 
(void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 63) == value) { - *found = 0; - return 0; - } - if (((tmp >> 6) & 63) == value) { - *found = 1; - return 1; - } - if (((tmp >> 12) & 63) == value) { - *found = 2; - return 2; - } - if (((tmp >> 18) & 63) == value) { - *found = 3; - return 3; - } - if (((tmp >> 24) & 63) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (6 - 4)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 4) & 63) == value) { - *found = 6; - return 6; - } - if (((tmp >> 10) & 63) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 63) == value) { - *found = 8; - return 8; - } - if (((tmp >> 22) & 63) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (6 - 2)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 2) & 63) == value) { - *found = 11; - return 11; - } - if (((tmp >> 8) & 63) == value) { - *found = 12; - return 12; - } - if (((tmp >> 14) & 63) == value) { - *found = 13; - return 13; - } - if (((tmp >> 20) & 63) == value) { - *found = 14; - return 14; - } - if (((tmp >> 26) & 63) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (12); -} - -static uint32_t -linsearch7_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 127) == value) { - *found = 0; - return 0; - } - if (((tmp >> 7) & 127) == value) { - *found = 1; - return 1; - } - if (((tmp >> 14) & 127) == value) { - *found = 2; - return 2; - } - if (((tmp >> 21) & 127) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 
bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (7 - 3)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 3) & 127) == value) { - *found = 5; - return 5; - } - if (((tmp >> 10) & 127) == value) { - *found = 6; - return 6; - } - if (((tmp >> 17) & 127) == value) { - *found = 7; - return 7; - } - if (((tmp >> 24) & 127) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (7 - 6)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 6) & 127) == value) { - *found = 10; - return 10; - } - if (((tmp >> 13) & 127) == value) { - *found = 11; - return 11; - } - if (((tmp >> 20) & 127) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (7 - 2)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 2) & 127) == value) { - *found = 14; - return 14; - } - if (((tmp >> 9) & 127) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (14); -} - -static uint32_t -linsearch8_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 255) == value) { - *found = 0; - return 0; - } - if (((tmp >> 8) & 255) == value) { - *found = 1; - return 1; - } - if (((tmp >> 16) & 255) == value) { - *found = 2; - return 2; - } - if (((tmp >> 24) & 255) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 255) == value) { - *found = 4; - return 4; - } - if (((tmp >> 8) & 255) == value) { - *found = 5; - return 5; - } - if (((tmp >> 16) & 255) == value) { - *found = 6; - return 6; - } - if (((tmp >> 24) & 255) == value) { - 
*found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 12) */ - if (((tmp >> 0) & 255) == value) { - *found = 8; - return 8; - } - if (((tmp >> 8) & 255) == value) { - *found = 9; - return 9; - } - if (((tmp >> 16) & 255) == value) { - *found = 10; - return 10; - } - if (((tmp >> 24) & 255) == value) { - *found = 11; - return 11; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 255) == value) { - *found = 12; - return 12; - } - if (((tmp >> 8) & 255) == value) { - *found = 13; - return 13; - } - if (((tmp >> 16) & 255) == value) { - *found = 14; - return 14; - } - if (((tmp >> 24) & 255) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (16); -} - -static uint32_t -linsearch9_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 511) == value) { - *found = 0; - return 0; - } - if (((tmp >> 9) & 511) == value) { - *found = 1; - return 1; - } - if (((tmp >> 18) & 511) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (9 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 511) == value) { - *found = 4; - return 4; - } - if (((tmp >> 13) & 511) == value) { - *found = 5; - return 5; - } - if (((tmp >> 22) & 511) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (9 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 511) == value) { - *found = 8; - return 8; - } - if (((tmp >> 17) & 511) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (9 - 3)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 3) & 511) == value) { - *found = 11; - return 11; - } - if (((tmp >> 12) & 511) == value) { - *found = 12; - return 12; - } - if (((tmp >> 21) & 511) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (9 - 7)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 7) & 511) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (18); -} - -static uint32_t -linsearch10_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1023) == value) { - *found = 0; - return 0; - } - if (((tmp >> 10) & 1023) == value) { - *found = 1; - return 1; - } - if (((tmp >> 20) & 1023) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (10 - 8)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 1023) == value) { - *found = 4; - return 4; - } - if (((tmp >> 18) & 1023) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (10 - 6)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 6) & 1023) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 1023) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (10 - 4)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 4) & 1023) == value) { - *found = 10; 
- return 10; - } - if (((tmp >> 14) & 1023) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (10 - 2)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 2) & 1023) == value) { - *found = 13; - return 13; - } - if (((tmp >> 12) & 1023) == value) { - *found = 14; - return 14; - } - if (((tmp >> 22) & 1023) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (20); -} - -static uint32_t -linsearch11_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2047) == value) { - *found = 0; - return 0; - } - if (((tmp >> 11) & 2047) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (11 - 1)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 1) & 2047) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 2047) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (11 - 2)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 2) & 2047) == value) { - *found = 6; - return 6; - } - if (((tmp >> 13) & 2047) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (11 - 3)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 3) & 2047) == value) { - *found = 9; - return 9; - } - if (((tmp >> 14) & 2047) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t 
*)in; - if ((tmp2 | (tmp % (1U << 4)) << (11 - 4)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 2047) == value) { - *found = 12; - return 12; - } - if (((tmp >> 15) & 2047) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (11 - 5)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 5) & 2047) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (22); -} - -static uint32_t -linsearch12_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 4095) == value) { - *found = 0; - return 0; - } - if (((tmp >> 12) & 4095) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 4) & 4095) == value) { - *found = 3; - return 3; - } - if (((tmp >> 16) & 4095) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 8) & 4095) == value) { - *found = 6; - return 6; - } - if (((tmp >> 20) & 4095) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 4095) == value) { - *found = 8; - return 8; - } - if (((tmp >> 12) & 4095) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 10; - return 10; - } - if 
(((tmp >> 4) & 4095) == value) { - *found = 11; - return 11; - } - if (((tmp >> 16) & 4095) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 8) & 4095) == value) { - *found = 14; - return 14; - } - if (((tmp >> 20) & 4095) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (24); -} - -static uint32_t -linsearch13_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 8191) == value) { - *found = 0; - return 0; - } - if (((tmp >> 13) & 8191) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (13 - 7)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 7) & 8191) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (13 - 1)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 1) & 8191) == value) { - *found = 5; - return 5; - } - if (((tmp >> 14) & 8191) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (13 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 8191) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (13 - 2)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 2) & 8191) == value) { - *found = 10; - return 10; - } - 
if (((tmp >> 15) & 8191) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (13 - 9)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 9) & 8191) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (13 - 3)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 3) & 8191) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (26); -} - -static uint32_t -linsearch14_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 16383) == value) { - *found = 0; - return 0; - } - if (((tmp >> 14) & 16383) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (14 - 10)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 10) & 16383) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (14 - 6)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 6) & 16383) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (14 - 2)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 2) & 16383) == value) { - *found = 7; - return 7; - } - if (((tmp >> 16) & 16383) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % 
(1U << 12)) << (14 - 12)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 12) & 16383) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (14 - 8)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 8) & 16383) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (14 - 4)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 4) & 16383) == value) { - *found = 14; - return 14; - } - if (((tmp >> 18) & 16383) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (28); -} - -static uint32_t -linsearch15_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 32767) == value) { - *found = 0; - return 0; - } - if (((tmp >> 15) & 32767) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (15 - 13)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 13) & 32767) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (15 - 11)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 11) & 32767) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (15 - 9)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 9) & 32767) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (15 - 7)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 7) & 32767) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (15 - 5)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 5) & 32767) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (15 - 3)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 3) & 32767) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (15 - 1)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 1) & 32767) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (30); -} - -static uint32_t -linsearch16_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 65535) == value) { - *found = 0; - return 0; - } - if (((tmp >> 16) & 65535) == value) { - *found = 1; - return 1; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 65535) == value) { - *found = 2; - return 2; - } - if (((tmp >> 16) & 65535) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 12) */ - if (((tmp >> 0) & 65535) == value) { - *found = 4; - return 4; - } - if (((tmp >> 16) & 65535) == value) { - *found = 5; - return 5; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 65535) == value) 
{ - *found = 6; - return 6; - } - if (((tmp >> 16) & 65535) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 20) */ - if (((tmp >> 0) & 65535) == value) { - *found = 8; - return 8; - } - if (((tmp >> 16) & 65535) == value) { - *found = 9; - return 9; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 24) */ - if (((tmp >> 0) & 65535) == value) { - *found = 10; - return 10; - } - if (((tmp >> 16) & 65535) == value) { - *found = 11; - return 11; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 28) */ - if (((tmp >> 0) & 65535) == value) { - *found = 12; - return 12; - } - if (((tmp >> 16) & 65535) == value) { - *found = 13; - return 13; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 32) */ - if (((tmp >> 0) & 65535) == value) { - *found = 14; - return 14; - } - if (((tmp >> 16) & 65535) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (32); -} - -static uint32_t -linsearch17_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 131071) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (17 - 2)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 2) & 131071) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (17 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 131071) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if 
((tmp2 | (tmp % (1U << 6)) << (17 - 6)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 6) & 131071) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (17 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 131071) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (17 - 10)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 10) & 131071) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (17 - 12)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 12) & 131071) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (17 - 14)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 14) & 131071) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (17 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (34); -} - -static uint32_t -linsearch18_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 262143) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (18 - 4)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 4) & 262143) == value) { 
- *found = 2; - return 2; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (18 - 8)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 262143) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (18 - 12)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 12) & 262143) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (18 - 16)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (18 - 2)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 2) & 262143) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (18 - 6)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 6) & 262143) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (18 - 10)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 10) & 262143) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (18 - 14)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 14) & 262143) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (36); -} - -static uint32_t -linsearch19_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t 
tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 524287) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (19 - 6)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 6) & 524287) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (19 - 12)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 524287) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (19 - 18)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (19 - 5)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 5) & 524287) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (19 - 11)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 11) & 524287) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (19 - 17)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (19 - 4)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 524287) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - 
tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (19 - 10)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 10) & 524287) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (19 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (38); -} - -static uint32_t -linsearch20_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1048575) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 8) & 1048575) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 4) & 1048575) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 12) & 1048575) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 24) */ - if (((tmp >> 0) & 1048575) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 
| (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 8) & 1048575) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 4) & 1048575) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 12) & 1048575) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (40); -} - -static uint32_t -linsearch21_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2097151) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (21 - 10)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 10) & 2097151) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (21 - 20)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (21 - 9)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 9) & 2097151) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 30; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (21 - 19)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (21 - 8)) == value) { - *found = 7; - return 7; - } - if (((tmp >> 8) & 2097151) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (21 - 18)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (21 - 7)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 7) & 2097151) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (21 - 17)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (21 - 6)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 6) & 2097151) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (21 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (42); -} - -static uint32_t -linsearch22_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 4194303) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = 
*(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (22 - 12)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (22 - 2)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 2) & 4194303) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (22 - 14)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (22 - 4)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 4) & 4194303) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (22 - 16)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (22 - 6)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 6) & 4194303) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (22 - 18)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (22 - 8)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 8) & 4194303) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (22 - 20)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 20; - in 
+= sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (22 - 10)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 10) & 4194303) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (44); -} - -static uint32_t -linsearch23_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 8388607) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (23 - 14)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (23 - 5)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 5) & 8388607) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (23 - 19)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (23 - 10)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (23 - 1)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 1) & 8388607) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (23 - 15)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = 
*(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (23 - 6)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 6) & 8388607) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (23 - 20)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (23 - 11)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (23 - 2)) == value) { - *found = 13; - return 13; - } - if (((tmp >> 2) & 8388607) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (23 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (46); -} - -static uint32_t -linsearch24_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 4; - return 
4; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 28) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 10; - return 10; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 11; - return 11; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 40) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (48); -} - -static uint32_t -linsearch25_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - 
if (((tmp >> 0) & 33554431) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (25 - 18)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (25 - 11)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (25 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 33554431) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (25 - 22)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (25 - 15)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (25 - 8)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (25 - 1)) == value) { - *found = 8; - return 8; - } - if (((tmp >> 1) & 33554431) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (25 - 19)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (25 - 12)) == value) { - *found = 11; - return 11; - } - tmp2 
= tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (25 - 5)) == value) { - *found = 12; - return 12; - } - if (((tmp >> 5) & 33554431) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (25 - 23)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (25 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (50); -} - -static uint32_t -linsearch26_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 67108863) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (26 - 20)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (26 - 14)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (26 - 8)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (26 - 2)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 2) & 67108863) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << 
(26 - 22)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (26 - 16)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (26 - 10)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (26 - 4)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 4) & 67108863) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (26 - 24)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (26 - 18)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (26 - 12)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (26 - 6)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 6) & 67108863) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (52); -} - -static uint32_t -linsearch27_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 134217727) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 
8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (27 - 22)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (27 - 17)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (27 - 12)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (27 - 7)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 7; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (27 - 2)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 2) & 134217727) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (27 - 24)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (27 - 19)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (27 - 14)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (27 - 9)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 9; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (27 - 4)) == value) { - *found = 11; - return 11; - } - if (((tmp >> 4) & 134217727) 
== value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (27 - 26)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 21)) << (27 - 21)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (27 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (54); -} - -static uint32_t -linsearch28_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 268435455) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 8; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 4) & 268435455) == value) { - *found = 7; - return 7; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 32) */ - if (((tmp >> 0) & 268435455) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 4) & 268435455) == value) { - *found = 15; - return 15; - } - /* remaining: 0 bits */ - return (56); -} - -static uint32_t -linsearch29_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 536870911) == value) { - *found = 0; - return 0; - 
} - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (29 - 26)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (29 - 23)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (29 - 20)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (29 - 17)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (29 - 14)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (29 - 11)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (29 - 8)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (29 - 5)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 5; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (29 - 2)) == value) { - *found = 9; - return 9; - } - if (((tmp >> 2) & 536870911) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (29 
- 28)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 25)) << (29 - 25)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (29 - 22)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (29 - 19)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (29 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (58); -} - -static uint32_t -linsearch30_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1073741823) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (30 - 28)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (30 - 26)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (30 - 24)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (30 - 22)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 22; - in 
+= sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (30 - 20)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (30 - 18)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (30 - 16)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (30 - 14)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (30 - 12)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (30 - 10)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (30 - 8)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (30 - 6)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 6; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (30 - 4)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 4; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (30 - 2)) == value) { - *found = 14; - return 14; - } - if (((tmp >> 2) & 1073741823) == value) { - 
*found = 15; - return 15; - } - /* remaining: 0 bits */ - return (60); -} - -static uint32_t -linsearch31_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2147483647) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 30)) << (31 - 30)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 29)) << (31 - 29)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (31 - 28)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 27)) << (31 - 27)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (31 - 26)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 25)) << (31 - 25)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (31 - 24)) == value) { - *found = 7; - return 7; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 36) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (31 - 23)) == value) { - *found = 8; - return 8; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* 
consumed: 4 bytes (total: 40) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (31 - 22)) == value) { - *found = 9; - return 9; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 44) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 21)) << (31 - 21)) == value) { - *found = 10; - return 10; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 48) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (31 - 20)) == value) { - *found = 11; - return 11; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 52) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (31 - 19)) == value) { - *found = 12; - return 12; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 56) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (31 - 18)) == value) { - *found = 13; - return 13; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 60) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (31 - 17)) == value) { - *found = 14; - return 14; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 64) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (31 - 16)) == value) { - *found = 15; - return 15; - } - /* remaining: 16 bits */ - return (62); -} - -static uint32_t -linsearch32_16(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t i; - uint32_t *in32 = (uint32_t *)in; - value -= base; - for (i = 0; i < 16; i++) { - if (in32[i] == value) { - *found = i; - return 0; - } - } - return 16 * sizeof(uint32_t); -} - -for_linsearchfunc_t for_linsearch16[33] = { - linsearch0_n, - linsearch1_16, - linsearch2_16, - linsearch3_16, - linsearch4_16, - linsearch5_16, - linsearch6_16, - linsearch7_16, - linsearch8_16, - linsearch9_16, - linsearch10_16, - linsearch11_16, - linsearch12_16, - linsearch13_16, - linsearch14_16, - 
linsearch15_16, - linsearch16_16, - linsearch17_16, - linsearch18_16, - linsearch19_16, - linsearch20_16, - linsearch21_16, - linsearch22_16, - linsearch23_16, - linsearch24_16, - linsearch25_16, - linsearch26_16, - linsearch27_16, - linsearch28_16, - linsearch29_16, - linsearch30_16, - linsearch31_16, - linsearch32_16 -}; - -static uint32_t -linsearch1_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1) == value) { - *found = 0; - return 0; - } - if (((tmp >> 1) & 1) == value) { - *found = 1; - return 1; - } - if (((tmp >> 2) & 1) == value) { - *found = 2; - return 2; - } - if (((tmp >> 3) & 1) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 1) == value) { - *found = 4; - return 4; - } - if (((tmp >> 5) & 1) == value) { - *found = 5; - return 5; - } - if (((tmp >> 6) & 1) == value) { - *found = 6; - return 6; - } - if (((tmp >> 7) & 1) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (1); -} - -static uint32_t -linsearch2_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 3) == value) { - *found = 0; - return 0; - } - if (((tmp >> 2) & 3) == value) { - *found = 1; - return 1; - } - if (((tmp >> 4) & 3) == value) { - *found = 2; - return 2; - } - if (((tmp >> 6) & 3) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 3) == value) { - *found = 4; - return 4; - } - if (((tmp >> 10) & 3) == value) { - *found = 5; - return 5; - } - if (((tmp >> 12) & 3) == value) { - *found = 6; - return 6; - } - if (((tmp >> 14) & 3) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (2); -} - -static uint32_t -linsearch3_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; 
- value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 7) == value) { - *found = 0; - return 0; - } - if (((tmp >> 3) & 7) == value) { - *found = 1; - return 1; - } - if (((tmp >> 6) & 7) == value) { - *found = 2; - return 2; - } - if (((tmp >> 9) & 7) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 7) == value) { - *found = 4; - return 4; - } - if (((tmp >> 15) & 7) == value) { - *found = 5; - return 5; - } - if (((tmp >> 18) & 7) == value) { - *found = 6; - return 6; - } - if (((tmp >> 21) & 7) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (3); -} - -static uint32_t -linsearch4_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 15) == value) { - *found = 0; - return 0; - } - if (((tmp >> 4) & 15) == value) { - *found = 1; - return 1; - } - if (((tmp >> 8) & 15) == value) { - *found = 2; - return 2; - } - if (((tmp >> 12) & 15) == value) { - *found = 3; - return 3; - } - if (((tmp >> 16) & 15) == value) { - *found = 4; - return 4; - } - if (((tmp >> 20) & 15) == value) { - *found = 5; - return 5; - } - if (((tmp >> 24) & 15) == value) { - *found = 6; - return 6; - } - if (((tmp >> 28) & 15) == value) { - *found = 7; - return 7; - } - /* remaining: 0 bits */ - return (4); -} - -static uint32_t -linsearch5_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 31) == value) { - *found = 0; - return 0; - } - if (((tmp >> 5) & 31) == value) { - *found = 1; - return 1; - } - if (((tmp >> 10) & 31) == value) { - *found = 2; - return 2; - } - if (((tmp >> 15) & 31) == value) { - *found = 3; - return 3; - } - if (((tmp >> 20) & 31) == value) { - *found = 4; - return 4; - } - if 
(((tmp >> 25) & 31) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (5 - 3)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 3) & 31) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (5); -} - -static uint32_t -linsearch6_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 63) == value) { - *found = 0; - return 0; - } - if (((tmp >> 6) & 63) == value) { - *found = 1; - return 1; - } - if (((tmp >> 12) & 63) == value) { - *found = 2; - return 2; - } - if (((tmp >> 18) & 63) == value) { - *found = 3; - return 3; - } - if (((tmp >> 24) & 63) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (6 - 4)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 4) & 63) == value) { - *found = 6; - return 6; - } - if (((tmp >> 10) & 63) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (6); -} - -static uint32_t -linsearch7_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 127) == value) { - *found = 0; - return 0; - } - if (((tmp >> 7) & 127) == value) { - *found = 1; - return 1; - } - if (((tmp >> 14) & 127) == value) { - *found = 2; - return 2; - } - if (((tmp >> 21) & 127) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (7 - 3)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 3) & 127) == value) { - 
*found = 5; - return 5; - } - if (((tmp >> 10) & 127) == value) { - *found = 6; - return 6; - } - if (((tmp >> 17) & 127) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (7); -} - -static uint32_t -linsearch8_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 255) == value) { - *found = 0; - return 0; - } - if (((tmp >> 8) & 255) == value) { - *found = 1; - return 1; - } - if (((tmp >> 16) & 255) == value) { - *found = 2; - return 2; - } - if (((tmp >> 24) & 255) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 255) == value) { - *found = 4; - return 4; - } - if (((tmp >> 8) & 255) == value) { - *found = 5; - return 5; - } - if (((tmp >> 16) & 255) == value) { - *found = 6; - return 6; - } - if (((tmp >> 24) & 255) == value) { - *found = 7; - return 7; - } - /* remaining: 0 bits */ - return (8); -} - -static uint32_t -linsearch9_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 511) == value) { - *found = 0; - return 0; - } - if (((tmp >> 9) & 511) == value) { - *found = 1; - return 1; - } - if (((tmp >> 18) & 511) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (9 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 511) == value) { - *found = 4; - return 4; - } - if (((tmp >> 13) & 511) == value) { - *found = 5; - return 5; - } - if (((tmp >> 22) & 511) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = 
*(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (9 - 8)) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (9); -} - -static uint32_t -linsearch10_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1023) == value) { - *found = 0; - return 0; - } - if (((tmp >> 10) & 1023) == value) { - *found = 1; - return 1; - } - if (((tmp >> 20) & 1023) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (10 - 8)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 1023) == value) { - *found = 4; - return 4; - } - if (((tmp >> 18) & 1023) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (10 - 6)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 6) & 1023) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (10); -} - -static uint32_t -linsearch11_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2047) == value) { - *found = 0; - return 0; - } - if (((tmp >> 11) & 2047) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (11 - 1)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 1) & 2047) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 2047) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - 
if ((tmp2 | (tmp % (1U << 2)) << (11 - 2)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 2) & 2047) == value) { - *found = 6; - return 6; - } - if (((tmp >> 13) & 2047) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (11); -} - -static uint32_t -linsearch12_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 4095) == value) { - *found = 0; - return 0; - } - if (((tmp >> 12) & 4095) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 4) & 4095) == value) { - *found = 3; - return 3; - } - if (((tmp >> 16) & 4095) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 8) & 4095) == value) { - *found = 6; - return 6; - } - if (((tmp >> 20) & 4095) == value) { - *found = 7; - return 7; - } - /* remaining: 0 bits */ - return (12); -} - -static uint32_t -linsearch13_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 8191) == value) { - *found = 0; - return 0; - } - if (((tmp >> 13) & 8191) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (13 - 7)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 7) & 8191) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 
4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (13 - 1)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 1) & 8191) == value) { - *found = 5; - return 5; - } - if (((tmp >> 14) & 8191) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (13 - 8)) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (13); -} - -static uint32_t -linsearch14_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 16383) == value) { - *found = 0; - return 0; - } - if (((tmp >> 14) & 16383) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (14 - 10)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 10) & 16383) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (14 - 6)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 6) & 16383) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (14 - 2)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 2) & 16383) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (14); -} - -static uint32_t -linsearch15_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 32767) == value) { - *found = 0; - return 0; - } - if 
(((tmp >> 15) & 32767) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (15 - 13)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 13) & 32767) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (15 - 11)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 11) & 32767) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (15 - 9)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 9) & 32767) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (15); -} - -static uint32_t -linsearch16_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 65535) == value) { - *found = 0; - return 0; - } - if (((tmp >> 16) & 65535) == value) { - *found = 1; - return 1; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 8) */ - if (((tmp >> 0) & 65535) == value) { - *found = 2; - return 2; - } - if (((tmp >> 16) & 65535) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 12) */ - if (((tmp >> 0) & 65535) == value) { - *found = 4; - return 4; - } - if (((tmp >> 16) & 65535) == value) { - *found = 5; - return 5; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 65535) == value) { - *found = 6; - return 6; - } - if (((tmp >> 16) & 65535) == value) { - *found = 7; - return 7; - } - /* remaining: 0 bits */ - return (16); -} - -static 
uint32_t -linsearch17_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 131071) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (17 - 2)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 2) & 131071) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (17 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 4) & 131071) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (17 - 6)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 6) & 131071) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (17 - 8)) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (17); -} - -static uint32_t -linsearch18_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 262143) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (18 - 4)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 4) & 262143) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) 
<< (18 - 8)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 8) & 262143) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (18 - 12)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 12) & 262143) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (18 - 16)) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (18); -} - -static uint32_t -linsearch19_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 524287) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (19 - 6)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 6) & 524287) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (19 - 12)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 12) & 524287) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (19 - 18)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (19 - 5)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 5) & 524287) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (19); -} - -static uint32_t 
-linsearch20_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1048575) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 8) & 1048575) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 4) & 1048575) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 12) & 1048575) == value) { - *found = 7; - return 7; - } - /* remaining: 0 bits */ - return (20); -} - -static uint32_t -linsearch21_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2097151) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (21 - 10)) == value) { - *found = 1; - return 1; - } - if (((tmp >> 10) & 2097151) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 
20)) << (21 - 20)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (21 - 9)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 9) & 2097151) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (21 - 19)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (21 - 8)) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (21); -} - -static uint32_t -linsearch22_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 4194303) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (22 - 12)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (22 - 2)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 2) & 4194303) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (22 - 14)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (22 - 4)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 4) & 4194303) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 26; 
- in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (22 - 16)) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (22); -} - -static uint32_t -linsearch23_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 8388607) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (23 - 14)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (23 - 5)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 5) & 8388607) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (23 - 19)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (23 - 10)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (23 - 1)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 1) & 8388607) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (23); -} - -static uint32_t -linsearch24_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 24; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 2; - return 2; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 3; - return 3; - } - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 16) */ - if (((tmp >> 0) & 16777215) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 8) & 16777215) == value) { - *found = 7; - return 7; - } - /* remaining: 0 bits */ - return (24); -} - -static uint32_t -linsearch25_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 33554431) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (25 - 18)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (25 - 11)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (25 - 4)) == value) { - *found = 3; - return 3; - } - if (((tmp >> 
4) & 33554431) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (25 - 22)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (25 - 15)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (25 - 8)) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (25); -} - -static uint32_t -linsearch26_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 67108863) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (26 - 20)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (26 - 14)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (26 - 8)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (26 - 2)) == value) { - *found = 4; - return 4; - } - if (((tmp >> 2) & 67108863) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (26 - 22)) == 
value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (26 - 16)) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (26); -} - -static uint32_t -linsearch27_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 134217727) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (27 - 22)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (27 - 17)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (27 - 12)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (27 - 7)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 7; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (27 - 2)) == value) { - *found = 5; - return 5; - } - if (((tmp >> 2) & 134217727) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (27 - 24)) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (27); -} - -static uint32_t -linsearch28_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= 
base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 268435455) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 6; - return 6; - } - if (((tmp >> 4) & 268435455) == value) { - *found = 7; - return 7; - } - /* remaining: 0 bits */ - return (28); -} - -static uint32_t -linsearch29_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 536870911) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (29 - 26)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 
bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (29 - 23)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (29 - 20)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (29 - 17)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (29 - 14)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (29 - 11)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (29 - 8)) == value) { - *found = 7; - return 7; - } - /* remaining: 24 bits */ - return (29); -} - -static uint32_t -linsearch30_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 1073741823) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (30 - 28)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (30 - 26)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (30 
- 24)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (30 - 22)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (30 - 20)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (30 - 18)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (30 - 16)) == value) { - *found = 7; - return 7; - } - /* remaining: 16 bits */ - return (30); -} - -static uint32_t -linsearch31_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t tmp, tmp2; - value -= base; - (void)tmp2; - tmp = *(uint32_t *)in; - /* consumed: 4 bytes (total: 4) */ - if (((tmp >> 0) & 2147483647) == value) { - *found = 0; - return 0; - } - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 8) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 30)) << (31 - 30)) == value) { - *found = 1; - return 1; - } - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 12) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 29)) << (31 - 29)) == value) { - *found = 2; - return 2; - } - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 16) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (31 - 28)) == value) { - *found = 3; - return 3; - } - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 20) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 27)) << (31 - 27)) == value) { - *found = 4; - return 4; - } - tmp2 = tmp >> 27; - in += 
sizeof(uint32_t); - /* consumed: 4 bytes (total: 24) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (31 - 26)) == value) { - *found = 5; - return 5; - } - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 28) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 25)) << (31 - 25)) == value) { - *found = 6; - return 6; - } - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - /* consumed: 4 bytes (total: 32) */ - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (31 - 24)) == value) { - *found = 7; - return 7; - } - /* remaining: 8 bits */ - return (31); -} - -static uint32_t -linsearch32_8(uint32_t base, const uint8_t *in, uint32_t value, int *found) { - uint32_t i; - uint32_t *in32 = (uint32_t *)in; - value -= base; - for (i = 0; i < 8; i++) { - if (in32[i] == value) { - *found = i; - return 0; - } - } - return 8 * sizeof(uint32_t); -} - -for_linsearchfunc_t for_linsearch8[33] = { - linsearch0_n, - linsearch1_8, - linsearch2_8, - linsearch3_8, - linsearch4_8, - linsearch5_8, - linsearch6_8, - linsearch7_8, - linsearch8_8, - linsearch9_8, - linsearch10_8, - linsearch11_8, - linsearch12_8, - linsearch13_8, - linsearch14_8, - linsearch15_8, - linsearch16_8, - linsearch17_8, - linsearch18_8, - linsearch19_8, - linsearch20_8, - linsearch21_8, - linsearch22_8, - linsearch23_8, - linsearch24_8, - linsearch25_8, - linsearch26_8, - linsearch27_8, - linsearch28_8, - linsearch29_8, - linsearch30_8, - linsearch31_8, - linsearch32_8 -}; - -static uint32_t -linsearch1_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 1)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 1) & 1)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 2) & 1)) { - *found = 2; - return 2; - } - if (length == 3) - 
goto bail; - if (value == ((tmp >> 3) & 1)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 4) & 1)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 5) & 1)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 6) & 1)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 7) & 1)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 1) + 7) / 8; -} - -static uint32_t -linsearch2_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 3)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 2) & 3)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 4) & 3)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 6) & 3)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 8) & 3)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 10) & 3)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 12) & 3)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 14) & 3)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 2) + 7) / 8; -} - -static uint32_t -linsearch3_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 7)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 3) & 7)) { - *found = 1; - return 1; - } - if (length == 2) - goto 
bail; - if (value == ((tmp >> 6) & 7)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 9) & 7)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 12) & 7)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 15) & 7)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 18) & 7)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 21) & 7)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 3) + 7) / 8; -} - -static uint32_t -linsearch4_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 15)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 4) & 15)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 8) & 15)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 12) & 15)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 16) & 15)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 20) & 15)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 24) & 15)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 28) & 15)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 4) + 7) / 8; -} - -static uint32_t -linsearch5_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 31)) { - *found = 0; - return 0; - } - if (length == 1) - 
goto bail; - if (value == ((tmp >> 5) & 31)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 10) & 31)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 15) & 31)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 20) & 31)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 25) & 31)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (5 - 3)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 3) & 31)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 5) + 7) / 8; -} - -static uint32_t -linsearch6_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 63)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 6) & 63)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 12) & 63)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 18) & 63)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 24) & 63)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (6 - 4)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 4) & 63)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 10) & 63)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 6) + 7) / 8; -} - -static uint32_t 
-linsearch7_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 127)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 7) & 127)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 14) & 127)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 21) & 127)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 3)) << (7 - 3)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 3) & 127)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 10) & 127)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 17) & 127)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 7) + 7) / 8; -} - -static uint32_t -linsearch8_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 255)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 8) & 255)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 16) & 255)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 24) & 255)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 255)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 8) & 255)) { - *found = 5; - return 5; - } - if (length == 6) - goto 
bail; - if (value == ((tmp >> 16) & 255)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 24) & 255)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 8) + 7) / 8; -} - -static uint32_t -linsearch9_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 511)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 9) & 511)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 18) & 511)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (9 - 4)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 4) & 511)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 13) & 511)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 22) & 511)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (9 - 8)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 9) + 7) / 8; -} - -static uint32_t -linsearch10_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 1023)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 10) & 1023)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 20) & 1023)) { - *found = 2; - return 2; - } - if (length == 
3) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (10 - 8)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 8) & 1023)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 18) & 1023)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (10 - 6)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 6) & 1023)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 10) + 7) / 8; -} - -static uint32_t -linsearch11_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 2047)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 11) & 2047)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (11 - 1)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 1) & 2047)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 12) & 2047)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (11 - 2)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 2) & 2047)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 13) & 2047)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 11) + 7) / 8; -} - 
-static uint32_t -linsearch12_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4095)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 12) & 4095)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (12 - 4)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 4) & 4095)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 16) & 4095)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (12 - 8)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 8) & 4095)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 20) & 4095)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 12) + 7) / 8; -} - -static uint32_t -linsearch13_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 8191)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 13) & 8191)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (13 - 7)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 7) & 8191)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 20; - in += 
sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (13 - 1)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 1) & 8191)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 14) & 8191)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (13 - 8)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 13) + 7) / 8; -} - -static uint32_t -linsearch14_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 16383)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 14) & 16383)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (14 - 10)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 10) & 16383)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (14 - 6)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 6) & 16383)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (14 - 2)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 2) & 16383)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 14) + 7) / 8; -} - -static uint32_t -linsearch15_x(uint32_t base, const 
uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 32767)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 15) & 32767)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 13)) << (15 - 13)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 13) & 32767)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (15 - 11)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 11) & 32767)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (15 - 9)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 9) & 32767)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 15) + 7) / 8; -} - -static uint32_t -linsearch16_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 65535)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - if (value == ((tmp >> 16) & 65535)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 65535)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 16) & 65535)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - in += 
sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 65535)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 16) & 65535)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 65535)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 16) & 65535)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 16) + 7) / 8; -} - -static uint32_t -linsearch17_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 131071)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (17 - 2)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 2) & 131071)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (17 - 4)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 4) & 131071)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (17 - 6)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 6) & 131071)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (17 - 8)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 17) + 7) / 8; -} - -static uint32_t 
-linsearch18_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 262143)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (18 - 4)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 4) & 262143)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (18 - 8)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 8) & 262143)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (18 - 12)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 12) & 262143)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (18 - 16)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 18) + 7) / 8; -} - -static uint32_t -linsearch19_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 524287)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 6)) << (19 - 6)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 6) & 524287)) { - *found = 2; - 
return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (19 - 12)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 12) & 524287)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (19 - 18)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 5)) << (19 - 5)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 5) & 524287)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 19) + 7) / 8; -} - -static uint32_t -linsearch20_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 1048575)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (20 - 8)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 8) & 1048575)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (20 - 16)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (20 - 4)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 4) & 1048575)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 24; - in 
+= sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (20 - 12)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 12) & 1048575)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 20) + 7) / 8; -} - -static uint32_t -linsearch21_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 2097151)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 21; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (21 - 10)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - if (value == ((tmp >> 10) & 2097151)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (21 - 20)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 9)) << (21 - 9)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 9) & 2097151)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (21 - 19)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (21 - 8)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 21) + 7) / 8; -} - -static uint32_t -linsearch22_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; 
- if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4194303)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (22 - 12)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (22 - 2)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 2) & 4194303)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (22 - 14)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (22 - 4)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 4) & 4194303)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (22 - 16)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 22) + 7) / 8; -} - -static uint32_t -linsearch23_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 8388607)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (23 - 14)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 
5)) << (23 - 5)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 5) & 8388607)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 19)) << (23 - 19)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 19; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 10)) << (23 - 10)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 10; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 1)) << (23 - 1)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 1) & 8388607)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 23) + 7) / 8; -} - -static uint32_t -linsearch24_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 16777215)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 16)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - if (value == ((tmp >> 8) & 16777215)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 16777215)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (24 - 
16)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (24 - 8)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 8) & 16777215)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 24) + 7) / 8; -} - -static uint32_t -linsearch25_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 33554431)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 18)) << (25 - 18)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (25 - 11)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (25 - 4)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - if (value == ((tmp >> 4) & 33554431)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (25 - 22)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 15)) << (25 - 15)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 15; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (25 - 8)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; 
-bail: - return ((length * 25) + 7) / 8; -} - -static uint32_t -linsearch26_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 67108863)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (26 - 20)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (26 - 14)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (26 - 8)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (26 - 2)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - if (value == ((tmp >> 2) & 67108863)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (26 - 22)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (26 - 16)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 26) + 7) / 8; -} - -static uint32_t -linsearch27_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 134217727)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - 
tmp2 = tmp >> 27; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (27 - 22)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (27 - 17)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (27 - 12)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 7)) << (27 - 7)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 7; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 2)) << (27 - 2)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - if (value == ((tmp >> 2) & 134217727)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (27 - 24)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 27) + 7) / 8; -} - -static uint32_t -linsearch28_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 268435455)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (28 - 24)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (28 - 20)) == value) { - *found = 2; - return 2; - } - if (length == 3) - 
goto bail; - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (28 - 16)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 16; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 12)) << (28 - 12)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 12; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (28 - 8)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 8; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 4)) << (28 - 4)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - if (value == ((tmp >> 4) & 268435455)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 28) + 7) / 8; -} - -static uint32_t -linsearch29_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 536870911)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (29 - 26)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 23)) << (29 - 23)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 23; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (29 - 20)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 17)) << (29 - 17)) == value) { - *found = 4; - return 4; - } - if 
(length == 5) - goto bail; - tmp2 = tmp >> 17; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 14)) << (29 - 14)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 14; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 11)) << (29 - 11)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 11; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 8)) << (29 - 8)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 29) + 7) / 8; -} - -static uint32_t -linsearch30_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 1073741823)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (30 - 28)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (30 - 26)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (30 - 24)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 24; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 22)) << (30 - 22)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 22; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 20)) << (30 - 20)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 20; - in += sizeof(uint32_t); - tmp = *(uint32_t 
*)in; - if ((tmp2 | (tmp % (1U << 18)) << (30 - 18)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 18; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 16)) << (30 - 16)) == value) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 30) + 7) / 8; -} - -static uint32_t -linsearch31_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 2147483647)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - tmp2 = tmp >> 31; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 30)) << (31 - 30)) == value) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - tmp2 = tmp >> 30; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 29)) << (31 - 29)) == value) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - tmp2 = tmp >> 29; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 28)) << (31 - 28)) == value) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - tmp2 = tmp >> 28; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 27)) << (31 - 27)) == value) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - tmp2 = tmp >> 27; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 26)) << (31 - 26)) == value) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - tmp2 = tmp >> 26; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 25)) << (31 - 25)) == value) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - tmp2 = tmp >> 25; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if ((tmp2 | (tmp % (1U << 24)) << (31 - 24)) == value) { - *found = 7; - return 
7; - } - if (length == 8) - goto bail; -bail: - return ((length * 31) + 7) / 8; -} - -static uint32_t -linsearch32_x(uint32_t base, const uint8_t *in, uint32_t length, uint32_t value, int *found) { - uint32_t tmp, tmp2; - (void)tmp2; - if (length == 0) - return 0; - value -= base; - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 0; - return 0; - } - if (length == 1) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 1; - return 1; - } - if (length == 2) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 2; - return 2; - } - if (length == 3) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 3; - return 3; - } - if (length == 4) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 4; - return 4; - } - if (length == 5) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 5; - return 5; - } - if (length == 6) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 6; - return 6; - } - if (length == 7) - goto bail; - in += sizeof(uint32_t); - tmp = *(uint32_t *)in; - if (value == ((tmp >> 0) & 4294967295)) { - *found = 7; - return 7; - } - if (length == 8) - goto bail; -bail: - return ((length * 32) + 7) / 8; -} - -for_linsearchxfunc_t for_linsearchx[33] = { - linsearch0_x, - linsearch1_x, - linsearch2_x, - linsearch3_x, - linsearch4_x, - linsearch5_x, - linsearch6_x, - linsearch7_x, - linsearch8_x, - linsearch9_x, - linsearch10_x, - linsearch11_x, - linsearch12_x, - linsearch13_x, - linsearch14_x, - linsearch15_x, - linsearch16_x, - linsearch17_x, - linsearch18_x, - linsearch19_x, - linsearch20_x, - linsearch21_x, - linsearch22_x, - 
linsearch23_x, - linsearch24_x, - linsearch25_x, - linsearch26_x, - linsearch27_x, - linsearch28_x, - linsearch29_x, - linsearch30_x, - linsearch31_x, - linsearch32_x -}; diff --git a/ext/for/for.c b/ext/for/for.c deleted file mode 100644 index e8c75a1..0000000 --- a/ext/for/for.c +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "for.h" - -#include -#include /* for memcpy */ - -#if defined(_MSC_VER) && _MSC_VER < 1600 -typedef unsigned int uint32_t; -typedef unsigned char uint8_t; -typedef signed char int8_t; -#else -# include -#endif - -#define METADATA 5 /* size of metadata overhead */ - -#ifdef _MSC_VER -# define INLINE __inline -# include - -uint32_t __inline CLZ(uint32_t value) { - uint32_t leading_zero = 0; - _BitScanReverse(&leading_zero, value); - return 31 - leading_zero; -} -#else -# define INLINE inline -# define CLZ __builtin_clz -#endif - -typedef uint32_t(*for_unpackfunc_t) (uint32_t, const uint8_t *, uint32_t *); -typedef uint32_t(*for_packfunc_t) (uint32_t, const uint32_t *, uint8_t *); -typedef uint32_t(*for_unpackxfunc_t) (uint32_t, const uint8_t *, uint32_t *, - uint32_t); -typedef uint32_t(*for_packxfunc_t) (uint32_t, const uint32_t *, uint8_t *, - uint32_t); -typedef uint32_t(*for_linsearchfunc_t)(uint32_t, const uint8_t *, uint32_t, - int *); -typedef uint32_t(*for_linsearchxfunc_t)(uint32_t, const uint8_t *, uint32_t, - 
uint32_t, int *); - -/* include the generated file */ -#include "for-gen.c" - -static INLINE uint32_t -bits(const uint32_t v) -{ - return v == 0 ? 0 : 32 - CLZ(v); -} - -uint32_t -for_compressed_size_bits(uint32_t length, uint32_t bits) -{ - uint32_t c = 0; - uint32_t b; - - assert(bits <= 32); - - /* each block is byte-aligned */ - if (length >= 32) { - b = length / 32; - c += ((b * 32 * bits) + 7) / 8; - length %= 32; - } - - if (length >= 16) { - b = length / 16; - c += ((b * 16 * bits) + 7) / 8; - length %= 16; - } - - if (length >= 8) { - b = length / 8; - c += ((b * 8 * bits) + 7) / 8; - length %= 8; - } - - return c + ((length * bits) + 7) / 8; -} - -uint32_t -for_compressed_size_unsorted(const uint32_t *in, uint32_t length) -{ - uint32_t i, b, m, M; - - if (length == 0) - return 0; - - /* calculate min/max */ - m = in[0]; - M = m; - - for (i = 1; i < length; i++) { - if (in[i] < m) - m = in[i]; - if (in[i] > M) - M = in[i]; - } - - /* calculate the bits */ - b = bits(M - m); - - return METADATA + for_compressed_size_bits(length, b); -} - -uint32_t -for_compressed_size_sorted(const uint32_t *in, uint32_t length) -{ - uint32_t b, m, M; - - if (length == 0) - return 0; - - /* calculate min/max */ - m = in[0]; - M = in[length - 1]; - - /* calculate the bits */ - b = bits(M - m); - - return METADATA + for_compressed_size_bits(length, b); -} - -uint32_t -for_compress_bits(const uint32_t *in, uint8_t *out, uint32_t length, - uint32_t base, uint32_t bits) -{ - uint32_t i = 0; - uint32_t written = 0; - - assert(bits <= 32); - - for (; i + 32 <= length; i += 32, in += 32) - written += for_pack32[bits](base, in, out + written); - - for (; i + 16 <= length; i += 16, in += 16) - written += for_pack16[bits](base, in, out + written); - - for (; i + 8 <= length; i += 8, in += 8) - written += for_pack8[bits](base, in, out + written); - - return written + for_packx[bits](base, in, out + written, length - i); -} - -uint32_t -for_compress_unsorted(const uint32_t *in, uint8_t 
*out, uint32_t length) -{ - uint32_t i, b, m, M; - - if (length == 0) - return 0; - - /* calculate min/max */ - m = in[0]; - M = m; - - for (i = 1; i < length; i++) { - if (in[i] < m) - m = in[i]; - if (in[i] > M) - M = in[i]; - } - - /* calculate the bits */ - b = bits(M - m); - - /* store m and the bits */ - *(uint32_t *)(out + 0) = m; - *(uint32_t *)(out + 4) = b; - return METADATA + for_compress_bits(in, out + METADATA, length, m, b); -} - -uint32_t -for_compress_sorted(const uint32_t *in, uint8_t *out, uint32_t length) -{ - uint32_t m, M, b; - - if (length == 0) - return 0; - - /* fetch min/max */ - m = in[0]; - M = in[length - 1]; - - /* calculate the bits */ - b = bits(M - m); - - /* store m and the bits */ - *(uint32_t *)(out + 0) = m; - *(uint32_t *)(out + 4) = b; - - return METADATA + for_compress_bits(in, out + METADATA, length, m, b); -} - -uint32_t -for_uncompress_bits(const uint8_t *in, uint32_t *out, uint32_t length, - uint32_t base, uint32_t bits) -{ - uint32_t i = 0; - const uint8_t *bin = in; - - assert(bits <= 32); - - for (; i + 32 <= length; i += 32, out += 32) - in += for_unpack32[bits](base, in, out); - - for (; i + 16 <= length; i += 16, out += 16) - in += for_unpack16[bits](base, in, out); - - for (; i + 8 <= length; i += 8, out += 8) - in += for_unpack8[bits](base, in, out); - - return (in - bin) + for_unpackx[bits](base, in, out, length - i); -} - -uint32_t -for_uncompress(const uint8_t *in, uint32_t *out, uint32_t length) -{ - uint32_t m, b; - - if (length == 0) - return 0; - - /* load min and the bits */ - m = *(uint32_t *)(in + 0); - b = *(in + 4); - - return METADATA + for_uncompress_bits(in + METADATA, out, length, m, b); -} - -uint32_t -for_select_bits(const uint8_t *in, uint32_t base, uint32_t bits, - uint32_t index) -{ - uint32_t b, start; - const uint32_t *in32; - - assert(bits <= 32); - - if (bits == 32) { - in32 = (uint32_t *)in; - return base + in32[index]; - } - - if (index > 32) { - b = index / 32; - in += (b * 32 * bits) / 
8; - index %= 32; - } - - if (index > 16) { - b = index / 16; - in += (b * 16 * bits) / 8; - index %= 16; - } - - if (index > 8) { - b = index / 8; - in += (b * 8 * bits) / 8; - index %= 8; - } - - start = index * bits; - - in += start / 8; - start %= 8; - - /* |in| now points to the byte where the requested index is stored */ - /* |start| is the bit position where the compressed value starts */ - - in32 = (uint32_t *)in; - - /* easy common case: the compressed value is not split between words */ - if (start + bits < 32) { - uint32_t mask = (1 << bits) - 1; - return base + ((*in32 >> start) & mask); - } - /* not so easy: restore value from two words */ - else { - uint32_t mask1 = (1 << bits) - 1; - uint32_t mask2 = (1 << (bits - (32 - start))) - 1; - uint32_t v1 = (*(in32 + 0) >> start) & mask1; - uint32_t v2 = *(in32 + 1) & mask2; - return base + ((v2 << (32 - start)) | v1); - } -} - -uint32_t -for_select(const uint8_t *in, uint32_t index) -{ - /* load min and the bits */ - uint32_t m = *(uint32_t *)(in + 0); - uint32_t b = *(in + 4); - - return for_select_bits(in + METADATA, m, b, index); -} - -uint32_t -for_linear_search(const uint8_t *in, uint32_t length, uint32_t value) -{ - /* load min and the bits */ - uint32_t m = *(uint32_t *)(in + 0); - uint32_t b = *(in + 4); - - return for_linear_search_bits(in + METADATA, length, m, b, value); -} - -uint32_t -for_linear_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value) -{ - uint32_t i = 0; - int found = -1; - - assert(bits <= 32); - if (bits == 0) - return (value == base ? 
0 : length); - - for (; i + 32 <= length; i += 32) { - in += for_linsearch32[bits](base, in, value, &found); - if (found >= 0) - return i + found; - } - - for (; i + 16 <= length; i += 16) { - in += for_linsearch16[bits](base, in, value, &found); - if (found >= 0) - return i + found; - } - - for (; i + 8 <= length; i += 8) { - in += for_linsearch8[bits](base, in, value, &found); - if (found >= 0) - return i + found; - } - - for_linsearchx[bits](base, in, length - i, value, &found); - if (found >= 0) - return i + found; - - /* not found */ - return length; -} - -uint32_t -for_lower_bound_search(const uint8_t *in, uint32_t length, uint32_t value, - uint32_t *actual) -{ - /* load min and the bits */ - uint32_t m = *(uint32_t *)(in + 0); - uint32_t b = *(in + 4); - - return for_lower_bound_search_bits(in + METADATA, length, m, b, - value, actual); -} - -/* adapted from wikipedia */ -uint32_t -for_lower_bound_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value, uint32_t *actual) -{ - uint32_t imid; - uint32_t imin = 0; - uint32_t imax = length - 1; - uint32_t v; - - while (imin + 1 < imax) { - imid = imin + ((imax - imin) / 2); - - v = for_select_bits(in, base, bits, imid); - if (v >= value) { - imax = imid; - } - else if (v < value) { - imin = imid; - } - } - - v = for_select_bits(in, base, bits, imin); - if (v >= value) { - *actual = v; - return imin; - } - - v = for_select_bits(in, base, bits, imax); - *actual = v; - return imax; -} diff --git a/ext/for/for.h b/ext/for/for.h deleted file mode 100644 index a8e0e14..0000000 --- a/ext/for/for.h +++ /dev/null @@ -1,241 +0,0 @@ -/* - * Copyright (C) 2005-2015 Christoph Rupp (chris@crupp.de). - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * A fast implementation for Frame of Reference encoding. - * - * See the README.md file for more information, example code and references. - * - * Feel free to send comments/questions to chris@crupp.de. I am available - * for consulting. - */ - -#ifndef FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e -#define FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e - -#include - -#ifdef __cplusplus -extern "C" { -#endif - - -/** - * Returns the size required to compress a sequence of |length| ints, - * each compressed with |bits| bits - * - * This function will NOT include any overhead required by - * for_compress_sorted() and for_compress_unsorted(). - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_compressed_size_bits(uint32_t length, uint32_t bits); - -/** - * Returns the size required to compress an unsorted sequence of |length| ints. - * - * This routine scans |in| for the min/max values and then calls - * for_compressed_size_bits(). - * - * The returned size will include the overhead required for - * for_compress_sorted() and for_compressed_unsorted(). - */ -extern uint32_t -for_compressed_size_unsorted(const uint32_t *in, uint32_t length); - -/** - * Returns the size required to compress a sorted sequence of |length| ints. - * - * This routine extracts min/max values at the beginning and end of - * the sequence, then calls for_compressed_size_bits(). It is therefore - * slightly faster than for_compressed_size_unsorted(). 
- * - * The returned size will include the overhead required for - * for_compress_sorted() and for_compressed_unsorted(). - */ -extern uint32_t -for_compressed_size_sorted(const uint32_t *in, uint32_t length); - -/** - * Compresses a sequence of |length| ints at |in| and stores the result - * in |out|. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * |bits| are the bits required to store a single integer. - * - * Returns the number of bytes used for compression. - * - * This is for advanced users who opt for storing |base| and |bits| on their - * own. This function is called by for_compress_sorted() and - * for_compress_unsorted(). - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_compress_bits(const uint32_t *in, uint8_t *out, uint32_t length, - uint32_t base, uint32_t bits); - -/** - * Compresses an unsorted sequence of |length| ints at |in| and stores the - * result in |out|. - * - * This routine scans |in| for the min/max values and then calls - * for_compress_bits(). - * - * The minimun value and the bits are stored as metadata in |out|. - */ -extern uint32_t -for_compress_unsorted(const uint32_t *in, uint8_t *out, uint32_t length); - -/** - * Compresses a sorted sequence of |length| ints at |in| and stores the - * result in |out|. - * - * This routine extracts min/max values at the beginning and end of - * the sequence, then calls for_compress_bits(). - * - * The minimun value and the bits are stored as metadata in |out|. - */ -extern uint32_t -for_compress_sorted(const uint32_t *in, uint8_t *out, uint32_t length); - -/** - * Uncompresses a sequence of |length| ints at |in| and stores the - * result in |out|. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * |bits| are the bits required to store a single integer. 
- * - * Returns the number of compressed bytes processed. - * - * This function is for advanced users. It is the counterpart of - * for_compress_bits(). - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_uncompress_bits(const uint8_t *in, uint32_t *out, uint32_t length, - uint32_t base, uint32_t bits); - -/** - * Uncompresses a sequence of |length| ints at |in| and stores the - * result in |out|. - * - * This function is a convenience wrapper for for_uncompress_bits(). It - * expects metadata at the beginning of |in|. Use in combination with - * for_compress_sorted() and for_compress_unsorted(). - * - * Returns the number of compressed bytes processed. - */ -extern uint32_t -for_uncompress(const uint8_t *in, uint32_t *out, uint32_t length); - -/** - * Returns the value at the given |index| from a compressed sequence. - * - * Make sure that |index| does not exceed the length of the sequence. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_select_bits(const uint8_t *in, uint32_t base, uint32_t bits, - uint32_t index); - -/** - * Returns the value at the given |index| from a compressed sequence. - * - * Make sure that |index| does not exceed the length of the sequence. - * - * This function is a convenience wrapper for for_select_bits(). It - * expects metadata at the beginning of |in|. Use in combination with - * for_compress_sorted() and for_compress_unsorted(). - */ -extern uint32_t -for_select(const uint8_t *in, uint32_t index); - -/** - * Performs a linear search for |value|. - * - * Returns the index of the found element, or |length| if the key was not - * found. - * - * This function is a convenience wrapper for for_linear_search_bits(). It - * expects metadata at the beginning of |in|. Use in combination with - * for_compress_sorted() and for_compress_unsorted(). 
- */ -extern uint32_t -for_linear_search(const uint8_t *in, uint32_t length, uint32_t value); - -/** - * Performs a linear search for |value|. - * - * Returns the index of the found element, or |length| if the key was not - * found. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. - * - * Invariant: bits <= 32 - */ -extern uint32_t -for_linear_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value); - -/** - * Performs lower bound binary search search for |value|. - * - * A lower bound search returns the first element in the sequence which does - * not compare less than |value|. - * The actual result is stored in |*actual|. - * - * This function is a convenience wrapper for for_lower_bound_search_bits(). It - * expects metadata at the beginning of |in|. Use in combination with - * for_compress_sorted() and for_compress_unsorted(). - */ -extern uint32_t -for_lower_bound_search(const uint8_t *in, uint32_t length, uint32_t value, - uint32_t *actual); - -/** - * Performs lower bound binary search search for |value|. - * - * A lower bound search returns the first element in the sequence which does - * not compare less than |value|. - * The actual result is stored in |*actual|. - * - * |base| is the "offset" (or common delta value) of all ints. It is usually - * set to the minimum value of the uncompressed sequence. 
- * - * Invariant: bits <= 32 - */ -extern uint32_t -for_lower_bound_search_bits(const uint8_t *in, uint32_t length, uint32_t base, - uint32_t bits, uint32_t value, uint32_t *actual); - - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* FOR_H_5580af15_4570_41f9_ba2b_8afb1400e81e */ diff --git a/ext/libfor b/ext/libfor new file mode 160000 index 0000000..4961180 --- /dev/null +++ b/ext/libfor @@ -0,0 +1 @@ +Subproject commit 49611808d08d4e47116aa2a3ddcabeb418f405f7 diff --git a/ext/libvbyte b/ext/libvbyte new file mode 160000 index 0000000..4fb7258 --- /dev/null +++ b/ext/libvbyte @@ -0,0 +1 @@ +Subproject commit 4fb7258ec96228b7d6008d511a39bb128e573def diff --git a/ext/lz4 b/ext/lz4 new file mode 160000 index 0000000..8c4de60 --- /dev/null +++ b/ext/lz4 @@ -0,0 +1 @@ +Subproject commit 8c4de60d0f871f8e81986b2c61b70e1bcb8b97fd diff --git a/ext/lz4.c b/ext/lz4.c deleted file mode 100644 index bcb43c3..0000000 --- a/ext/lz4.c +++ /dev/null @@ -1,1515 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2015, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 source repository : https://github.com/Cyan4973/lz4 - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c -*/ - - -/************************************** -* Tuning parameters -**************************************/ -/* - * HEAPMODE : - * Select how default compression functions will allocate memory for their hash table, - * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). - */ -#define HEAPMODE 0 - -/* - * ACCELERATION_DEFAULT : - * Select the value of "acceleration" for LZ4_compress_fast() when parameter == 0 - */ -#define ACCELERATION_DEFAULT 1 - - -/************************************** -* CPU Feature Detection -**************************************/ -/* - * LZ4_FORCE_SW_BITCOUNT - * Define this parameter if your target system or compiler does not support hardware bit count - */ -#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ -# define LZ4_FORCE_SW_BITCOUNT -#endif - - -/************************************** -* Includes -**************************************/ -#include "lz4.h" - - -/************************************** -* Compiler Options -**************************************/ -#ifdef _MSC_VER /* Visual Studio */ -# define FORCE_INLINE static __forceinline -# include -# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ -# pragma 
warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ -#else -# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ -# if defined(__GNUC__) || defined(__clang__) -# define FORCE_INLINE static inline __attribute__((always_inline)) -# else -# define FORCE_INLINE static inline -# endif -# else -# define FORCE_INLINE static -# endif /* __STDC_VERSION__ */ -#endif /* _MSC_VER */ - -/* LZ4_GCC_VERSION is defined into lz4.h */ -#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) -# define expect(expr,value) (__builtin_expect ((expr),(value)) ) -#else -# define expect(expr,value) (expr) -#endif - -#define likely(expr) expect((expr) != 0, 1) -#define unlikely(expr) expect((expr) != 0, 0) - - -/************************************** -* Memory routines -**************************************/ -#include /* malloc, calloc, free */ -#define ALLOCATOR(n,s) calloc(n,s) -#define FREEMEM free -#include /* memset, memcpy */ -#define MEM_INIT memset - - -/************************************** -* Basic Types -**************************************/ -#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ -# include - typedef uint8_t BYTE; - typedef uint16_t U16; - typedef uint32_t U32; - typedef int32_t S32; - typedef uint64_t U64; -#else - typedef unsigned char BYTE; - typedef unsigned short U16; - typedef unsigned int U32; - typedef signed int S32; - typedef unsigned long long U64; -#endif - - -/************************************** -* Reading and writing into memory -**************************************/ -#define STEPSIZE sizeof(size_t) - -static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } - -static unsigned LZ4_isLittleEndian(void) -{ - const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ - return one.c[0]; -} - - -static U16 LZ4_read16(const void* memPtr) -{ - U16 val16; - memcpy(&val16, memPtr, 2); - return val16; -} - -static U16 
LZ4_readLE16(const void* memPtr) -{ - if (LZ4_isLittleEndian()) - { - return LZ4_read16(memPtr); - } - else - { - const BYTE* p = (const BYTE*)memPtr; - return (U16)((U16)p[0] + (p[1]<<8)); - } -} - -static void LZ4_writeLE16(void* memPtr, U16 value) -{ - if (LZ4_isLittleEndian()) - { - memcpy(memPtr, &value, 2); - } - else - { - BYTE* p = (BYTE*)memPtr; - p[0] = (BYTE) value; - p[1] = (BYTE)(value>>8); - } -} - -static U32 LZ4_read32(const void* memPtr) -{ - U32 val32; - memcpy(&val32, memPtr, 4); - return val32; -} - -static U64 LZ4_read64(const void* memPtr) -{ - U64 val64; - memcpy(&val64, memPtr, 8); - return val64; -} - -static size_t LZ4_read_ARCH(const void* p) -{ - if (LZ4_64bits()) - return (size_t)LZ4_read64(p); - else - return (size_t)LZ4_read32(p); -} - - -static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); } - -static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); } - -/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ -static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) -{ - BYTE* d = (BYTE*)dstPtr; - const BYTE* s = (const BYTE*)srcPtr; - BYTE* e = (BYTE*)dstEnd; - do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll((U64)val) >> 3); -# else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; -# endif - } - else /* 32 bits */ - { -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r; - _BitScanForward( &r, (U32)val ); - return (int)(r>>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) 
- return (__builtin_ctz((U32)val) >> 3); -# else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; -# endif - } - } - else /* Big Endian CPU */ - { - if (LZ4_64bits()) - { -# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse64( &r, val ); - return (unsigned)(r>>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll((U64)val) >> 3); -# else - unsigned r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; -# endif - } - else /* 32 bits */ - { -# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, (unsigned long)val ); - return (unsigned)(r>>3); -# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz((U32)val) >> 3); -# else - unsigned r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; -# endif - } - } -} - -static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) -{ - const BYTE* const pStart = pIn; - - while (likely(pIn compression run slower on incompressible data */ - - -/************************************** -* Local Structures and types -**************************************/ -typedef struct { - U32 hashTable[HASH_SIZE_U32]; - U32 currentOffset; - U32 initCheck; - const BYTE* dictionary; - BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ - U32 dictSize; -} LZ4_stream_t_internal; - -typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; -typedef enum { byPtr, byU32, byU16 } tableType_t; - -typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; -typedef enum { 
noDictIssue = 0, dictSmall } dictIssue_directive; - -typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; -typedef enum { full = 0, partial = 1 } earlyEnd_directive; - - -/************************************** -* Local Utils -**************************************/ -int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } -int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } -int LZ4_sizeofState() { return LZ4_STREAMSIZE; } - - - -/******************************** -* Compression functions -********************************/ - -static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType) -{ - if (tableType == byU16) - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); - else - return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); -} - -static const U64 prime5bytes = 889523592379ULL; -static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType) -{ - const U32 hashLog = (tableType == byU16) ? 
LZ4_HASHLOG+1 : LZ4_HASHLOG; - const U32 hashMask = (1<> (40 - hashLog)) & hashMask; -} - -static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType) -{ - if (LZ4_64bits()) - return LZ4_hashSequence64(sequence, tableType); - return LZ4_hashSequence((U32)sequence, tableType); -} - -static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); } - -static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase) -{ - switch (tableType) - { - case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } - case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } - case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } - } -} - -static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); -} - -static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } - if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } - { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ -} - -static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) -{ - U32 h = LZ4_hashPosition(p, tableType); - return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); -} - -static int LZ4_compress_generic( - void* const ctx, - const char* const source, - char* const dest, - const int inputSize, - const int maxOutputSize, - const limitedOutput_directive outputLimited, - const tableType_t tableType, - const dict_directive dict, - 
const dictIssue_directive dictIssue, - const U32 acceleration) -{ - LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx; - - const BYTE* ip = (const BYTE*) source; - const BYTE* base; - const BYTE* lowLimit; - const BYTE* const lowRefLimit = ip - dictPtr->dictSize; - const BYTE* const dictionary = dictPtr->dictionary; - const BYTE* const dictEnd = dictionary + dictPtr->dictSize; - const size_t dictDelta = dictEnd - (const BYTE*)source; - const BYTE* anchor = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - const BYTE* const mflimit = iend - MFLIMIT; - const BYTE* const matchlimit = iend - LASTLITERALS; - - BYTE* op = (BYTE*) dest; - BYTE* const olimit = op + maxOutputSize; - - U32 forwardH; - size_t refDelta=0; - - /* Init conditions */ - if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ - switch(dict) - { - case noDict: - default: - base = (const BYTE*)source; - lowLimit = (const BYTE*)source; - break; - case withPrefix64k: - base = (const BYTE*)source - dictPtr->currentOffset; - lowLimit = (const BYTE*)source - dictPtr->dictSize; - break; - case usingExtDict: - base = (const BYTE*)source - dictPtr->currentOffset; - lowLimit = (const BYTE*)source; - break; - } - if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ - if (inputSize> LZ4_skipTrigger); - - if (unlikely(forwardIp > mflimit)) goto _last_literals; - - match = LZ4_getPositionOnHash(h, ctx, tableType, base); - if (dict==usingExtDict) - { - if (match<(const BYTE*)source) - { - refDelta = dictDelta; - lowLimit = dictionary; - } - else - { - refDelta = 0; - lowLimit = (const BYTE*)source; - } - } - forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putPositionOnHash(ip, h, ctx, tableType, base); - - } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0) - || ((tableType==byU16) ? 
0 : (match + MAX_DISTANCE < ip)) - || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) ); - } - - /* Catch up */ - while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; } - - { - /* Encode Literal length */ - unsigned litLength = (unsigned)(ip - anchor); - token = op++; - if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) - return 0; /* Check output limit */ - if (litLength>=RUN_MASK) - { - int len = (int)litLength-RUN_MASK; - *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; - *op++ = (BYTE)len; - } - else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; - matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit); - ip += MINMATCH + matchLength; - if (ip==limit) - { - unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); - matchLength += more; - ip += more; - } - } - else - { - matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit); - ip += MINMATCH + matchLength; - } - - if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) - return 0; /* Check output limit */ - if (matchLength>=ML_MASK) - { - *token += ML_MASK; - matchLength -= ML_MASK; - for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } - if (matchLength >= 255) { matchLength-=255; *op++ = 255; } - *op++ = (BYTE)matchLength; - } - else *token += (BYTE)(matchLength); - } - - anchor = ip; - - /* Test end of chunk */ - if (ip > mflimit) break; - - /* Fill table */ - LZ4_putPosition(ip-2, ctx, tableType, base); - - /* Test next position */ - match = LZ4_getPosition(ip, ctx, tableType, base); - if (dict==usingExtDict) - { - if (match<(const BYTE*)source) - { - refDelta = dictDelta; - lowLimit = dictionary; - } - else - { - refDelta = 0; - lowLimit = (const BYTE*)source; - } - } - LZ4_putPosition(ip, ctx, tableType, base); - if ( ((dictIssue==dictSmall) ? 
(match>=lowRefLimit) : 1) - && (match+MAX_DISTANCE>=ip) - && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) ) - { token=op++; *token=0; goto _next_match; } - - /* Prepare next loop */ - forwardH = LZ4_hashPosition(++ip, tableType); - } - -_last_literals: - /* Encode Last Literals */ - { - const size_t lastRun = (size_t)(iend - anchor); - if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) - return 0; /* Check output limit */ - if (lastRun >= RUN_MASK) - { - size_t accumulator = lastRun - RUN_MASK; - *op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; - *op++ = (BYTE) accumulator; - } - else - { - *op++ = (BYTE)(lastRun<= LZ4_compressBound(inputSize)) - { - if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration); - else - return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration); - } - else - { - if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); - else - return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); - } -} - - -int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) -{ -#if (HEAPMODE) - void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ -#else - LZ4_stream_t ctx; - void* ctxPtr = &ctx; -#endif - - int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); - -#if (HEAPMODE) - FREEMEM(ctxPtr); -#endif - return result; -} - - -int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize) -{ - return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1); -} - - -/* hidden debug function */ -/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */ -int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) -{ - LZ4_stream_t ctx; - - LZ4_resetStream(&ctx); - - if (inputSize < LZ4_64Klimit) - return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); - else - return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? 
byU32 : byPtr, noDict, noDictIssue, acceleration); -} - - -/******************************** -* destSize variant -********************************/ - -static int LZ4_compress_destSize_generic( - void* const ctx, - const char* const src, - char* const dst, - int* const srcSizePtr, - const int targetDstSize, - const tableType_t tableType) -{ - const BYTE* ip = (const BYTE*) src; - const BYTE* base = (const BYTE*) src; - const BYTE* lowLimit = (const BYTE*) src; - const BYTE* anchor = ip; - const BYTE* const iend = ip + *srcSizePtr; - const BYTE* const mflimit = iend - MFLIMIT; - const BYTE* const matchlimit = iend - LASTLITERALS; - - BYTE* op = (BYTE*) dst; - BYTE* const oend = op + targetDstSize; - BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */; - BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */); - BYTE* const oMaxSeq = oMaxLit - 1 /* token */; - - U32 forwardH; - - - /* Init conditions */ - if (targetDstSize < 1) return 0; /* Impossible to store anything */ - if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ - if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0; /* Size too large (not within 64K limit) */ - if (*srcSizePtr> LZ4_skipTrigger); - - if (unlikely(forwardIp > mflimit)) - goto _last_literals; - - match = LZ4_getPositionOnHash(h, ctx, tableType, base); - forwardH = LZ4_hashPosition(forwardIp, tableType); - LZ4_putPositionOnHash(ip, h, ctx, tableType, base); - - } while ( ((tableType==byU16) ? 
0 : (match + MAX_DISTANCE < ip)) - || (LZ4_read32(match) != LZ4_read32(ip)) ); - } - - /* Catch up */ - while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; } - - { - /* Encode Literal length */ - unsigned litLength = (unsigned)(ip - anchor); - token = op++; - if (op + ((litLength+240)/255) + litLength > oMaxLit) - { - /* Not enough space for a last match */ - op--; - goto _last_literals; - } - if (litLength>=RUN_MASK) - { - unsigned len = litLength - RUN_MASK; - *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; - *op++ = (BYTE)len; - } - else *token = (BYTE)(litLength< oMaxMatch) - { - /* Match description too long : reduce it */ - matchLength = (15-1) + (oMaxMatch-op) * 255; - } - //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH); - ip += MINMATCH + matchLength; - - if (matchLength>=ML_MASK) - { - *token += ML_MASK; - matchLength -= ML_MASK; - while (matchLength >= 255) { matchLength-=255; *op++ = 255; } - *op++ = (BYTE)matchLength; - } - else *token += (BYTE)(matchLength); - } - - anchor = ip; - - /* Test end of block */ - if (ip > mflimit) break; - if (op > oMaxSeq) break; - - /* Fill table */ - LZ4_putPosition(ip-2, ctx, tableType, base); - - /* Test next position */ - match = LZ4_getPosition(ip, ctx, tableType, base); - LZ4_putPosition(ip, ctx, tableType, base); - if ( (match+MAX_DISTANCE>=ip) - && (LZ4_read32(match)==LZ4_read32(ip)) ) - { token=op++; *token=0; goto _next_match; } - - /* Prepare next loop */ - forwardH = LZ4_hashPosition(++ip, tableType); - } - -_last_literals: - /* Encode Last Literals */ - { - size_t lastRunSize = (size_t)(iend - anchor); - if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend) - { - /* adapt lastRunSize to fill 'dst' */ - lastRunSize = (oend-op) - 1; - lastRunSize -= (lastRunSize+240)/255; - } - ip = anchor + lastRunSize; - - if (lastRunSize >= RUN_MASK) - { - size_t accumulator = lastRunSize - RUN_MASK; - 
*op++ = RUN_MASK << ML_BITS; - for(; accumulator >= 255 ; accumulator-=255) *op++ = 255; - *op++ = (BYTE) accumulator; - } - else - { - *op++ = (BYTE)(lastRunSize<= LZ4_compressBound(*srcSizePtr)) /* compression success is guaranteed */ - { - return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); - } - else - { - if (*srcSizePtr < LZ4_64Klimit) - return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16); - else - return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr); - } -} - - -int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) -{ -#if (HEAPMODE) - void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ -#else - LZ4_stream_t ctxBody; - void* ctx = &ctxBody; -#endif - - int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); - -#if (HEAPMODE) - FREEMEM(ctx); -#endif - return result; -} - - - -/******************************** -* Streaming functions -********************************/ - -LZ4_stream_t* LZ4_createStream(void) -{ - LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64); - LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ - LZ4_resetStream(lz4s); - return lz4s; -} - -void LZ4_resetStream (LZ4_stream_t* LZ4_stream) -{ - MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t)); -} - -int LZ4_freeStream (LZ4_stream_t* LZ4_stream) -{ - FREEMEM(LZ4_stream); - return (0); -} - - -#define HASH_UNIT sizeof(size_t) -int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) -{ - LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; - const BYTE* p = (const BYTE*)dictionary; - const BYTE* const dictEnd = p + dictSize; - const BYTE* base; - - if ((dict->initCheck) || (dict->currentOffset > 1 GB)) /* Uninitialized structure, 
or reuse overflow */ - LZ4_resetStream(LZ4_dict); - - if (dictSize < (int)HASH_UNIT) - { - dict->dictionary = NULL; - dict->dictSize = 0; - return 0; - } - - if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; - dict->currentOffset += 64 KB; - base = p - dict->currentOffset; - dict->dictionary = p; - dict->dictSize = (U32)(dictEnd - p); - dict->currentOffset += dict->dictSize; - - while (p <= dictEnd-HASH_UNIT) - { - LZ4_putPosition(p, dict->hashTable, byU32, base); - p+=3; - } - - return dict->dictSize; -} - - -static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) -{ - if ((LZ4_dict->currentOffset > 0x80000000) || - ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ - { - /* rescale hash table */ - U32 delta = LZ4_dict->currentOffset - 64 KB; - const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; - int i; - for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; - else LZ4_dict->hashTable[i] -= delta; - } - LZ4_dict->currentOffset = 64 KB; - if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; - LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; - } -} - - -int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) -{ - LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; - const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; - - const BYTE* smallest = (const BYTE*) source; - if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ - if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; - LZ4_renormDictT(streamPtr, smallest); - if (acceleration < 1) acceleration = ACCELERATION_DEFAULT; - - /* Check overlapping input/dictionary space */ - { - const BYTE* sourceEnd = (const BYTE*) source + inputSize; - if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) - { - streamPtr->dictSize = (U32)(dictEnd - sourceEnd); - if 
(streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; - if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; - streamPtr->dictionary = dictEnd - streamPtr->dictSize; - } - } - - /* prefix mode : source data follows dictionary */ - if (dictEnd == (const BYTE*)source) - { - int result; - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration); - else - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration); - streamPtr->dictSize += (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - return result; - } - - /* external dictionary mode */ - { - int result; - if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration); - else - result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration); - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - return result; - } -} - - -/* Hidden debug function, to force external dictionary mode */ -int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) -{ - LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; - int result; - const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; - - const BYTE* smallest = dictEnd; - if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; - LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); - - result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, 
usingExtDict, noDictIssue, 1); - - streamPtr->dictionary = (const BYTE*)source; - streamPtr->dictSize = (U32)inputSize; - streamPtr->currentOffset += (U32)inputSize; - - return result; -} - - -int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) -{ - LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; - const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; - - if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ - if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; - - memmove(safeBuffer, previousDictEnd - dictSize, dictSize); - - dict->dictionary = (const BYTE*)safeBuffer; - dict->dictSize = (U32)dictSize; - - return dictSize; -} - - - -/******************************* -* Decompression functions -*******************************/ -/* - * This generic decompression function cover all use cases. - * It shall be instantiated several times, using different sets of directives - * Note that it is essential this generic function is really inlined, - * in order to remove useless branches during compilation optimization. - */ -FORCE_INLINE int LZ4_decompress_generic( - const char* const source, - char* const dest, - int inputSize, - int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ - - int endOnInput, /* endOnOutputSize, endOnInputSize */ - int partialDecoding, /* full, partial */ - int targetOutputSize, /* only used if partialDecoding==partial */ - int dict, /* noDict, withPrefix64k, usingExtDict */ - const BYTE* const lowPrefix, /* == dest if dict == noDict */ - const BYTE* const dictStart, /* only if dict==usingExtDict */ - const size_t dictSize /* note : = 0 if noDict */ - ) -{ - /* Local Variables */ - const BYTE* ip = (const BYTE*) source; - const BYTE* const iend = ip + inputSize; - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + outputSize; - BYTE* cpy; - BYTE* oexit = op + targetOutputSize; - const BYTE* const lowLimit = lowPrefix - dictSize; - - const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; - const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4}; - const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; - - const int safeDecode = (endOnInput==endOnInputSize); - const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); - - - /* Special cases */ - if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ - if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; /* Empty output buffer */ - if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); - - - /* Main Loop */ - while (1) - { - unsigned token; - size_t length; - const BYTE* match; - - /* get literal length */ - token = *ip++; - if ((length=(token>>ML_BITS)) == RUN_MASK) - { - unsigned s; - do - { - s = *ip++; - length += s; - } - while (likely((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) - || ((!endOnInput) && (cpy>oend-COPYLENGTH))) - { - if (partialDecoding) - { - if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ - if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ - } - else - { - if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ - if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ - } - memcpy(op, ip, length); - ip += length; - op += length; - break; /* Necessarily EOF, due to parsing restrictions */ - } - LZ4_wildCopy(op, ip, cpy); - ip += length; op = cpy; - - /* get offset */ - match = cpy - LZ4_readLE16(ip); ip+=2; - if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ - - /* get matchlength */ - length = token & ML_MASK; - if (length == ML_MASK) - { - unsigned s; - do - { - if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; - s = *ip++; - length += s; - } while (s==255); - if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ - } - length += MINMATCH; - - /* check external dictionary */ - if ((dict==usingExtDict) && (match < lowPrefix)) - { - if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error; /* doesn't respect parsing restriction */ - - if (length <= (size_t)(lowPrefix-match)) - { - /* match can be copied as a single 
segment from external dictionary */ - match = dictEnd - (lowPrefix-match); - memmove(op, match, length); op += length; - } - else - { - /* match encompass external dictionary and current segment */ - size_t copySize = (size_t)(lowPrefix-match); - memcpy(op, dictEnd - copySize, copySize); - op += copySize; - copySize = length - copySize; - if (copySize > (size_t)(op-lowPrefix)) /* overlap within current segment */ - { - BYTE* const endOfMatch = op + copySize; - const BYTE* copyFrom = lowPrefix; - while (op < endOfMatch) *op++ = *copyFrom++; - } - else - { - memcpy(op, lowPrefix, copySize); - op += copySize; - } - } - continue; - } - - /* copy repeated sequence */ - cpy = op + length; - if (unlikely((op-match)<8)) - { - const size_t dec64 = dec64table[op-match]; - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[op-match]; - LZ4_copy4(op+4, match); - op += 8; match -= dec64; - } else { LZ4_copy8(op, match); op+=8; match+=8; } - - if (unlikely(cpy>oend-12)) - { - if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last LASTLITERALS bytes must be literals */ - if (op < oend-8) - { - LZ4_wildCopy(op, match, oend-8); - match += (oend-8) - op; - op = oend-8; - } - while (opprefixSize = (size_t) dictSize; - lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; - lz4sd->externalDict = NULL; - lz4sd->extDictSize = 0; - return 1; -} - -/* -*_continue() : - These decoding functions allow decompression of multiple blocks in "streaming" mode. - Previously decoded blocks must still be available at the memory position where they were decoded. 
- If it's not possible, save the relevant part of decoded data into a safe buffer, - and indicate where it stands using LZ4_setStreamDecode() -*/ -int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) -{ - LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; - int result; - - if (lz4sd->prefixEnd == (BYTE*)dest) - { - result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - endOnInputSize, full, 0, - usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize += result; - lz4sd->prefixEnd += result; - } - else - { - lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, - endOnInputSize, full, 0, - usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize = result; - lz4sd->prefixEnd = (BYTE*)dest + result; - } - - return result; -} - -int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) -{ - LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; - int result; - - if (lz4sd->prefixEnd == (BYTE*)dest) - { - result = LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, - usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize += originalSize; - lz4sd->prefixEnd += originalSize; - } - else - { - lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize; - result = LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, - usingExtDict, (BYTE*)dest, lz4sd->externalDict, 
lz4sd->extDictSize); - if (result <= 0) return result; - lz4sd->prefixSize = originalSize; - lz4sd->prefixEnd = (BYTE*)dest + originalSize; - } - - return result; -} - - -/* -Advanced decoding functions : -*_usingDict() : - These decoding functions work the same as "_continue" ones, - the dictionary must be explicitly provided within parameters -*/ - -FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize) -{ - if (dictSize==0) - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0); - if (dictStart+dictSize == dest) - { - if (dictSize >= (int)(64 KB - 1)) - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0); - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0); - } - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); -} - -int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) -{ - return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize); -} - -int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) -{ - return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize); -} - -/* debug function */ -int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize); 
-} - - -/*************************************************** -* Obsolete Functions -***************************************************/ -/* obsolete compression functions */ -int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); } -int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); } -int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); } -int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); } -int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); } -int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); } - -/* -These function names are deprecated and should no longer be used. -They are only provided here for compatibility with older user programs. 
-- LZ4_uncompress is totally equivalent to LZ4_decompress_fast -- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe -*/ -int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } -int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } - - -/* Obsolete Streaming functions */ - -int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } - -static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base) -{ - MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); - lz4ds->bufferStart = base; -} - -int LZ4_resetStreamState(void* state, char* inputBuffer) -{ - if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ - LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer); - return 0; -} - -void* LZ4_create (char* inputBuffer) -{ - void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64); - LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer); - return lz4ds; -} - -char* LZ4_slideInputBuffer (void* LZ4_Data) -{ - LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data; - int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB); - return (char*)(ctx->bufferStart + dictSize); -} - -/* Obsolete streaming decompression functions */ - -int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) -{ - return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); -} - -int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) -{ - return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB); -} - -#endif /* LZ4_COMMONDEFS_ONLY */ diff --git a/ext/lz4.h b/ext/lz4.h deleted file mode 
100644 index 20e3d48..0000000 --- a/ext/lz4.h +++ /dev/null @@ -1,361 +0,0 @@ -/* - LZ4 - Fast LZ compression algorithm - Header File - Copyright (C) 2011-2015, Yann Collet. - - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 source repository : https://github.com/Cyan4973/lz4 - - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c -*/ -#pragma once - -#if defined (__cplusplus) -extern "C" { -#endif - -/* - * lz4.h provides block compression functions, and gives full buffer control to programmer. 
- * If you need to generate inter-operable compressed data (respecting LZ4 frame specification), - * and can let the library handle its own memory, please use lz4frame.h instead. -*/ - -/************************************** -* Version -**************************************/ -#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ -#define LZ4_VERSION_MINOR 7 /* for new (non-breaking) interface capabilities */ -#define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ -#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) -int LZ4_versionNumber (void); - -/************************************** -* Tuning parameter -**************************************/ -/* - * LZ4_MEMORY_USAGE : - * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) - * Increasing memory usage improves compression ratio - * Reduced memory usage can improve speed, due to cache effect - * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache - */ -#define LZ4_MEMORY_USAGE 14 - - -/************************************** -* Simple Functions -**************************************/ - -int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize); -int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); - -/* -LZ4_compress_default() : - Compresses 'sourceSize' bytes from buffer 'source' - into already allocated 'dest' buffer of size 'maxDestSize'. - Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize). - It also runs faster, so it's a recommended setting. - If the function cannot compress 'source' into a more limited 'dest' budget, - compression stops *immediately*, and the function result is zero. - As a consequence, 'dest' content is not valid. - This function never writes outside 'dest' buffer, nor read outside 'source' buffer. 
- sourceSize : Max supported value is LZ4_MAX_INPUT_VALUE - maxDestSize : full or partial size of buffer 'dest' (which must be already allocated) - return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize) - or 0 if compression fails - -LZ4_decompress_safe() : - compressedSize : is the precise full size of the compressed block. - maxDecompressedSize : is the size of destination buffer, which must be already allocated. - return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize) - If destination buffer is not large enough, decoding will stop and output an error code (<0). - If the source stream is detected malformed, the function will stop decoding and return a negative result. - This function is protected against buffer overflow exploits, including malicious data packets. - It never writes outside output buffer, nor reads outside input buffer. -*/ - - -/************************************** -* Advanced Functions -**************************************/ -#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ -#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) - -/* -LZ4_compressBound() : - Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) - This function is primarily useful for memory allocation purposes (destination buffer size). - Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). 
- Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize) - inputSize : max supported value is LZ4_MAX_INPUT_SIZE - return : maximum output size in a "worst case" scenario - or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) -*/ -int LZ4_compressBound(int inputSize); - -/* -LZ4_compress_fast() : - Same as LZ4_compress_default(), but allows to select an "acceleration" factor. - The larger the acceleration value, the faster the algorithm, but also the lesser the compression. - It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. - An acceleration value of "0" means "use Default value" (see lz4.c) - An acceleration value of "1" is the same as regular LZ4_compress_default() -*/ -int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration); - - -/* -LZ4_compress_fast_extState() : - Same compression function, just using an externally allocated memory space to store compression state. - Use LZ4_sizeofState() to know how much memory must be allocated, - and allocate it on 8-bytes boundaries (using malloc() typically). - Then, provide it as 'void* state' to compression function. -*/ -int LZ4_sizeofState(void); -int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration); - - -/* -LZ4_compress_destSize() : - Reverse the logic, by compressing as much data as possible from 'source' buffer - into already allocated buffer 'dest' of size 'targetDestSize'. - This function either compresses the entire 'source' content into 'dest' if it's large enough, - or fill 'dest' buffer completely with as much data as possible from 'source'. - Original idea by WiredTiger team. - *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'. - New value is necessarily <= old value. 
- return : Nb bytes written into 'dest' (necessarily <= targetDestSize) - or 0 if compression fails -*/ -int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize); - - -/* -LZ4_decompress_fast() : - originalSize : is the original and therefore uncompressed size - return : the number of bytes read from the source buffer (in other words, the compressed size) - If the source stream is detected malformed, the function will stop decoding and return a negative result. - Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes. - note : This function fully respect memory boundaries for properly formed compressed data. - It is a bit faster than LZ4_decompress_safe(). - However, it does not provide any protection against intentionally modified data stream (malicious input). - Use this function in trusted environment only (data to decode comes from a trusted source). -*/ -int LZ4_decompress_fast (const char* source, char* dest, int originalSize); - -/* -LZ4_decompress_safe_partial() : - This function decompress a compressed block of size 'compressedSize' at position 'source' - into destination buffer 'dest' of size 'maxDecompressedSize'. - The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached, - reducing decompression time. - return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize) - Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller. - Always control how many bytes were decoded. - If the source stream is detected malformed, the function will stop decoding and return a negative result. - This function never writes outside of output buffer, and never reads outside of input buffer. 
It is therefore protected against malicious data packets -*/ -int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); - - -/*********************************************** -* Streaming Compression Functions -***********************************************/ -#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4) -#define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U64 * sizeof(long long)) -/* - * LZ4_stream_t - * information structure to track an LZ4 stream. - * important : init this structure content before first use ! - * note : only allocated directly the structure if you are statically linking LZ4 - * If you are using liblz4 as a DLL, please use below construction methods instead. - */ -typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t; - -/* - * LZ4_resetStream - * Use this function to init an allocated LZ4_stream_t structure - */ -void LZ4_resetStream (LZ4_stream_t* streamPtr); - -/* - * LZ4_createStream will allocate and initialize an LZ4_stream_t structure - * LZ4_freeStream releases its memory. - * In the context of a DLL (liblz4), please use these methods rather than the static struct. - * They are more future proof, in case of a change of LZ4_stream_t size. - */ -LZ4_stream_t* LZ4_createStream(void); -int LZ4_freeStream (LZ4_stream_t* streamPtr); - -/* - * LZ4_loadDict - * Use this function to load a static dictionary into LZ4_stream. - * Any previous data will be forgotten, only 'dictionary' will remain in memory. - * Loading a size of 0 is allowed. - * Return : dictionary size, in bytes (necessarily <= 64 KB) - */ -int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); - -/* - * LZ4_compress_fast_continue - * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio. - * Important : Previous data blocks are assumed to still be present and unmodified ! 
- * 'dst' buffer must be already allocated. - * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. - * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero. - */ -int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration); - -/* - * LZ4_saveDict - * If previously compressed data block is not guaranteed to remain available at its memory location - * save it into a safer place (char* safeBuffer) - * Note : you don't need to call LZ4_loadDict() afterwards, - * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue() - * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error - */ -int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize); - - -/************************************************ -* Streaming Decompression Functions -************************************************/ - -#define LZ4_STREAMDECODESIZE_U64 4 -#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) -typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t; -/* - * LZ4_streamDecode_t - * information structure to track an LZ4 stream. - * init this structure content using LZ4_setStreamDecode or memset() before first use ! - * - * In the context of a DLL (liblz4) please prefer usage of construction methods below. - * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future. - * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure - * LZ4_freeStreamDecode releases its memory. - */ -LZ4_streamDecode_t* LZ4_createStreamDecode(void); -int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); - -/* - * LZ4_setStreamDecode - * Use this function to instruct where to find the dictionary. 
- * Setting a size of 0 is allowed (same effect as reset). - * Return : 1 if OK, 0 if error - */ -int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); - -/* -*_continue() : - These decoding functions allow decompression of multiple blocks in "streaming" mode. - Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB) - In the case of a ring buffers, decoding buffer must be either : - - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions) - In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB). - - Larger than encoding buffer, by a minimum of maxBlockSize more bytes. - maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block. - In which case, encoding and decoding buffers do not need to be synchronized, - and encoding ring buffer can have any size, including small ones ( < 64 KB). - - _At least_ 64 KB + 8 bytes + maxBlockSize. - In which case, encoding and decoding buffers do not need to be synchronized, - and encoding ring buffer can have any size, including larger than decoding buffer. - Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer, - and indicate where it is saved using LZ4_setStreamDecode() -*/ -int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize); -int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize); - - -/* -Advanced decoding functions : -*_usingDict() : - These decoding functions work the same as - a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue() - They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure. 
-*/ -int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize); -int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize); - - - -/************************************** -* Obsolete Functions -**************************************/ -/* Deprecate Warnings */ -/* Should these warnings messages be a problem, - it is generally possible to disable them, - with -Wno-deprecated-declarations for gcc - or _CRT_SECURE_NO_WARNINGS in Visual for example. - You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */ -#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK -# define LZ4_DEPRECATE_WARNING_DEFBLOCK -# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -# if (LZ4_GCC_VERSION >= 405) || defined(__clang__) -# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) -# elif (LZ4_GCC_VERSION >= 301) -# define LZ4_DEPRECATED(message) __attribute__((deprecated)) -# elif defined(_MSC_VER) -# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) -# else -# pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler") -# define LZ4_DEPRECATED(message) -# endif -#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */ - -/* Obsolete compression functions */ -/* These functions are planned to start generate warnings by r131 approximately */ -int LZ4_compress (const char* source, char* dest, int sourceSize); -int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize); -int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); -int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); -int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); -int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, 
char* dest, int inputSize, int maxOutputSize); - -/* Obsolete decompression functions */ -/* These function names are completely deprecated and must no longer be used. - They are only provided here for compatibility with older programs. - - LZ4_uncompress is the same as LZ4_decompress_fast - - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe - These function prototypes are now disabled; uncomment them only if you really need them. - It is highly recommended to stop using these prototypes and migrate to maintained ones */ -/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */ -/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */ - -/* Obsolete streaming functions; use new streaming interface whenever possible */ -LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer); -LZ4_DEPRECATED("use LZ4_createStream() instead") int LZ4_sizeofStreamState(void); -LZ4_DEPRECATED("use LZ4_resetStream() instead") int LZ4_resetStreamState(void* state, char* inputBuffer); -LZ4_DEPRECATED("use LZ4_saveDict() instead") char* LZ4_slideInputBuffer (void* state); - -/* Obsolete streaming decoding functions */ -LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); -LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); - - -#if defined (__cplusplus) -} -#endif diff --git a/ext/polycom/optp4.c b/ext/polycom/optp4.c new file mode 100644 index 0000000..414985e --- /dev/null +++ b/ext/polycom/optp4.c @@ -0,0 +1,22 @@ +#include "../OPT_PFD/opt_p4.h" // OptPFD + +unsigned char *optpfdenc32(unsigned *__restrict in, int n, unsigned *__restrict out) { + if(n < 128) + out = vbyteenc(in, n, (unsigned *)out); + else { + unsigned tmp[OPTPFDMAX]; + for(i = 0; i < n; i++) tmp[i] = 
in[i]; + return out += OPT4(tmp, n, (unsigned *)out); + } + return out; +} + +unsigned char *optpfddec32(unsigned *__restrict in, int n, unsigned *__restrict out) { + if(n < 128) + in = vbytedec(in, n, out); + else { + unsigned all_array[OPTPFDMAX]; + return (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); + } +} + diff --git a/ext/polycom/optp4.h b/ext/polycom/optp4.h new file mode 100644 index 0000000..651513f --- /dev/null +++ b/ext/polycom/optp4.h @@ -0,0 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif +#define OPTPFDMAX 2048 +unsigned char *optpfdenc32(unsigned *__restrict in, int n, unsigned *__restrict out); +unsigned char *optpfddec32(unsigned *__restrict in, int n, unsigned *__restrict out); + +#ifdef __cplusplus +} +#endif + diff --git a/ext/polycom/optpfd.c b/ext/polycom/optpfd.c new file mode 100644 index 0000000..84a4ac5 --- /dev/null +++ b/ext/polycom/optpfd.c @@ -0,0 +1,26 @@ +#include +#include "../OPT_PFD/opt_p4.h" // OptPFD + +#include "optpfd.h" +#include "polyvbyte.h" +unsigned char *optpfdenc32(unsigned *in, int n, unsigned char *out) { + if(n < 128) + out = vbpolyenc(in, n, out); + else { + unsigned tmp[OPTPFDMAX],i; + for(i = 0; i < n; i++) tmp[i] = in[i]; + return out += OPT4(tmp, n, (unsigned *)out); + } + return out; +} + +unsigned char *optpfddec32(unsigned char *in, int n, unsigned *out) { + if(n < 128) + in = vbpolydec(in, n, out); + else { + unsigned all_array[OPTPFDMAX]; + in = (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); + } + return in; +} + diff --git a/ext/polycom/optpfd.h b/ext/polycom/optpfd.h new file mode 100644 index 0000000..9ff838e --- /dev/null +++ b/ext/polycom/optpfd.h @@ -0,0 +1,11 @@ +#ifdef __cplusplus +extern "C" { +#endif +#define OPTPFDMAX 2048 +unsigned char *optpfdenc32(unsigned *in, int n, unsigned char *out); +unsigned char *optpfddec32(unsigned char *in, int n, unsigned *out); + +#ifdef __cplusplus +} +#endif + diff --git a/ext/polycom/polyvbyte.c 
b/ext/polycom/polyvbyte.c new file mode 100644 index 0000000..480180c --- /dev/null +++ b/ext/polycom/polyvbyte.c @@ -0,0 +1,14 @@ +#include "vbyte_poly.h" +#include "polyvbyte.h" + +unsigned char *vbpolyenc(unsigned *in, unsigned n, unsigned char *out) { + unsigned i; + for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); } + return out; +} +unsigned char *vbpolydec(unsigned char *in, unsigned n, unsigned *out) { + unsigned i; + for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; } + return in; +} + diff --git a/ext/polycom/polyvbyte.h b/ext/polycom/polyvbyte.h new file mode 100644 index 0000000..f8b3a99 --- /dev/null +++ b/ext/polycom/polyvbyte.h @@ -0,0 +1,10 @@ +#ifdef __cplusplus +extern "C" { +#endif +unsigned char *vbpolyenc(unsigned *in, unsigned n, unsigned char *out); +unsigned char *vbpolydec(unsigned char *in, unsigned n, unsigned *out); +#ifdef __cplusplus +} +#endif + + diff --git a/ext/vbyte_poly.h b/ext/polycom/vbyte_poly.h similarity index 100% rename from ext/vbyte_poly.h rename to ext/polycom/vbyte_poly.h diff --git a/ext/qmx/GNUmakefile b/ext/qmx/GNUmakefile deleted file mode 100644 index cbe112e..0000000 --- a/ext/qmx/GNUmakefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# OS X and Linux Makefile -# - -compress_qmx : - g++ -O3 -msse4 compress_qmx.c -o compress_qmx - -clean : - rm compress_qmx - diff --git a/ext/qmx/README b/ext/qmx/README deleted file mode 100644 index 129cf75..0000000 --- a/ext/qmx/README +++ /dev/null @@ -1,16 +0,0 @@ -QMX README ----------- -The source is released under the BSD license (you choose which one). - -See (and please cite), in the ACM Digital Library (and on my website): - -A. Trotman (2014), Compression, SIMD, and Postings Lists. In Proceedings of the 19th Australasian Document Computing Symposium (ADCS 2014) - -One C++ class is provided. It compiles and runs on Linux, OS X, and Windows. 
Use make to build the executable that compresses and decompressed one string (and checks that the code works). - -IMPORTANT NOTE --------------- -As QMX decodes in "chunks", it can (i.e. will normally) decode more integers than requested. In other words, it will normally overflow the output buffer. Allowing for 256 "extras" will suffice. These extras will be garbage. Although it is possible to encode to prevent (much) "junk", in this implementation the decision was made to favour smaller compressed size and the consequence is more junk decoded. - -Andrew - diff --git a/ext/qmx/compress_qmx.h b/ext/qmx/compress_qmx.h deleted file mode 100644 index 0b29915..0000000 --- a/ext/qmx/compress_qmx.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - COMPRESS_QMX.H - -------------- -*/ -#ifndef COMPRESS_QMX_H_ -#define COMPRESS_QMX_H_ - -#include - - -#ifdef __cplusplus -extern "C" { -#endif - -unsigned char *qmx_enc( const uint32_t *in, unsigned n, unsigned char *out); -unsigned char *qmx_dec(const unsigned char *in, unsigned len, uint32_t *out, unsigned n); - -#ifdef __cplusplus -} -#endif -#endif - diff --git a/ext/qmx/makefile b/ext/qmx/makefile deleted file mode 100644 index 47c00c2..0000000 --- a/ext/qmx/makefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# Windows Makefile -# - -compress_qmx.exe : - cl /Ox /Tp compress_qmx.c - -clean : - del compress_qmx.obj compress_qmx.exe - diff --git a/ext/rc.c b/ext/rc.c new file mode 100644 index 0000000..b46599c --- /dev/null +++ b/ext/rc.c @@ -0,0 +1,1811 @@ +// Copyright (c) 2008, WEST, Polytechnic Institute of NYU. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of WEST, Polytechnic Institute of NYU. nor the names +// of its contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: Torsten Suel, Jiangong Zhang, Jinru He +// +// If you have any questions or problems about our codes, please contact: +// jhe@cis.poly.edu +// +// + +//#include "rice_coding2.h" +//#include + +/*rc_rice_coding2() { + // TODO Auto-generated constructor stub + cnum[0] = 0; + cnum[1] = 1; + cnum[2] = 2; + cnum[3] = 3; + cnum[4] = 4; + cnum[5] = 5; + cnum[6] = 6; + cnum[7] = 7; + cnum[8] = 8; + cnum[9] = 9; + cnum[10] = 10; + cnum[11] = 11; + cnum[12] = 12; + cnum[13] = 13; + cnum[14] = 16; + cnum[15] = 20; + cnum[16] = 32; +}*/ +#define coding_type 3 +#define block_size 128 + +static int cnum[] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32 }; + + +/*rc_~rice_coding2() { + // TODO Auto-generated destructor stub +}*/ + +/*int rc_get_type() +{ + return coding_type; +} + +void rc_set_size(int size) +{ + this->block_size = size; +}*/ +//void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w); +#include "../bitpack.h" +#include "../bitunpack.h" +#include "rc.h" + + void setBit(unsigned char *buf, unsigned int *bp, unsigned int val) + { + unsigned int bPtr; + unsigned int w; + + bPtr = (*bp)&7; + if (bPtr == 0) buf[(*bp)>>3] = 0; + if (val == 1) buf[(*bp)>>3] |= (1< 0) + { + s = ((bits * block_size)>>5); + for (i = 0; i < s; i++) w[i] = 0; + for (i = 0; i < block_size; i++) out[i] = (*buf)[i] & ((1u<>bits); val > 0; val--) + setBit((unsigned char *)(w), &bp, 1); + + setBit((unsigned char *)(w), &bp, 0); + } + w += (bp>>5); + if (bp&31) w += 1; + } + *buf += block_size; + return w; +} + +unsigned char *rcenc32(unsigned* input, int size, unsigned* output) +{ + int i,s, f = 0; + int fres; + unsigned int bb = 0; + unsigned int bp; + int m = 0; + //block_size = size; + for (i = 0, bb = 0; i < block_size; i++) + { + //printf("%d\n", input[i]); + bb += input[i]; + } + + bb = 100 * (bb / block_size) / 100; + //printf("bb=%d, ", bb); + int b; + for (b = 0; bb > 0; bb = (bb>>1)) b++; + + + + if (b > 0) b--; + + //cout << b << endl; + for (fres = 0; 
cnum[fres] < b; fres++);//{printf("%d, %d\n", cnum[fres], fres);} + + b = cnum[fres]; + //printf("b:%d, f:%d\n", b,fres); + + unsigned* tmp = input; + unsigned* tmp2 = output+1; + tmp2 = rc_turbo_rice_encode(tmp2, &tmp, b); + *output = (unsigned)((b<<6)|fres); + + return tmp2;// - output; +} + +unsigned char *rc_turbo_rice_decode(unsigned int *w, unsigned int *buf, + unsigned int bits, unsigned int flag) + +{ + unsigned int i; + unsigned char b; + unsigned char *ww; + unsigned int val; + + //(unpack[flag])(buf, *w, block_size); + w = bitunpack32(w, block_size, (unsigned char *)buf, flag); + //*w += ((bits * block_size)>>5); + +if (bits < 32) +{ + i = 0; + ww = (unsigned char *)(w); + val = 1<>2); + if (i & 3) w += 1; +} + return w; +} + +unsigned char *rcdec32(unsigned* input, int size, unsigned* output) +{ + unsigned* tmp = input; + int flag = (*tmp)&0x3f; + int b = ((*tmp)>>6); + tmp++; + return rc_turbo_rice_decode(tmp, output, b, flag); + //return tmp; // - input; + +} diff --git a/ext/rc.h b/ext/rc.h new file mode 100644 index 0000000..05f66ac --- /dev/null +++ b/ext/rc.h @@ -0,0 +1,8 @@ +#ifdef __cplusplus +extern "C" { +#endif +unsigned char *rcenc32(unsigned* input, int size, unsigned* output); +unsigned char *rcdec32(unsigned* input, int size, unsigned* output); +#ifdef __cplusplus +} +#endif diff --git a/ext/simdcomp b/ext/simdcomp new file mode 160000 index 0000000..6757770 --- /dev/null +++ b/ext/simdcomp @@ -0,0 +1 @@ +Subproject commit 6757770723e5c310e18661033deb6b20a17cc62d diff --git a/ext/simdcomp/bitpacka.c b/ext/simdcomp/bitpacka.c deleted file mode 100644 index 9a09747..0000000 --- a/ext/simdcomp/bitpacka.c +++ /dev/null @@ -1,17774 +0,0 @@ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wreturn-type" -#include "bitpacka.h" -#define INLINE inline -uint32_t * nullpacker(const uint32_t * __restrict in, uint32_t * __restrict out) { - return out; -} - - const uint32_t * nullunpacker8(const uint32_t * __restrict in, uint32_t * 
__restrict out) { - memset(out,0,8 * 4); - return in; - } - - - uint32_t * __fastpackwithoutmask1_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in++) ; - *out |= ( (*in++) ) << 1 ; - *out |= ( (*in++) ) << 2 ; - *out |= ( (*in++) ) << 3 ; - *out |= ( (*in++) ) << 4 ; - *out |= ( (*in++) ) << 5 ; - *out |= ( (*in++) ) << 6 ; - *out |= ( (*in++) ) << 7 ; - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask2_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in++) ; - *out |= ( (*in++) ) << 2 ; - *out |= ( (*in++) ) << 4 ; - *out |= ( (*in++) ) << 6 ; - *out |= ( (*in++) ) << 8 ; - *out |= ( (*in++) ) << 10 ; - *out |= ( (*in++) ) << 12 ; - *out |= ( (*in++) ) << 14 ; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask3_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask4_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask5_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 5 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - 
++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask6_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 6 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask7_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 7 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask8_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask9_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 9 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 9 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask10_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - 
*out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 10 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 10 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask11_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 11 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 11 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask12_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask13_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 13 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 13 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 13 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask14_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) 
) << 14 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 14 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 14 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 14 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask15_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 15 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 15 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 15 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask16_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask17_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 17 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 17 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 17 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 17 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask18_8(const uint32_t * __restrict in, uint32_t * 
__restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 18 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 18 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 18 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 18 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask19_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 19 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 19 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 19 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 19 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask20_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask21_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 21 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 21 - 20 ); - ++in; - *out |= 
( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 21 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 21 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 21 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask22_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 22 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 22 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 22 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 22 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 22 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask23_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 23 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 23 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 23 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 23 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 23 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask24_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) 
<< 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask25_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 25 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 25 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 25 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 25 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 25 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 25 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask26_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 26 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 26 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 26 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 26 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 26 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 26 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask27_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 27 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 27 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 27 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 27 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - *out = ( (*in) ) >> ( 27 - 2 ); - ++in; - *out 
|= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 27 - 24 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask28_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask29_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 29 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 29 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 29 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 29 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 29 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 29 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 29 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask30_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 30 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 30 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 30 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 30 - 22 ); - ++in; - *out |= ( (*in) ) << 
22 ; - ++out; - *out = ( (*in) ) >> ( 30 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 30 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 30 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask31_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 31 - 30 ); - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 31 - 29 ); - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 31 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 31 - 27 ); - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 31 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 31 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 31 - 24 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask32_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - - return out; - } - -#if 0 -#define OUTI(__x) *out++ -#define OUT(__x) *out -#define OUI out++ -#else -#define OUTI(__x) out[__x] -#define OUT(__x) out[__x] -#define OUI -#endif -const INLINE uint32_t * __fastunpack1_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) & 1; - OUTI( 1) = ( (*in) >> 1 ) & 1; - OUTI( 2) = ( (*in) >> 2 ) & 1; - OUTI( 3) = ( (*in) >> 3 ) & 1; - OUTI( 4) = ( (*in) >> 4 ) & 1; - OUTI( 5) = ( (*in) >> 5 ) & 1; - OUTI( 6) = ( (*in) >> 6 ) & 1; - OUTI( 7) = ( (*in) >> 7 ) & 1; - return in + 1; -} - -const INLINE uint32_t * __fastunpack2_8(const uint32_t * __restrict in, uint32_t * 
__restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; - OUTI( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; - OUTI( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; - OUTI( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; - OUTI( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; - OUTI( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; - OUTI( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; - OUTI( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack3_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; - OUTI( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; - OUTI( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; - OUTI( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; - OUTI( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; - OUTI( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; - OUTI( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; - OUTI( 7) = ( (*in) >> 21 ) % (1U << 3 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack4_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; - OUTI( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; - OUTI( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; - OUTI( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; - OUTI( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; - OUTI( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; - OUTI( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; - OUTI( 7) = ( (*in++) >> 28 ) ; - return in; -} - -const uint32_t * __fastunpack5_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; - OUTI( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; - OUTI( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; - OUTI( 3) = ( (*in) >> 15 ) % (1U << 5 ) ; - OUTI( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; - OUTI( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; - OUT( 6) = ( (*in++) >> 30 ) ; - OUT( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); - OUI; - OUTI( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack6_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; - OUTI( 1) = ( (*in) >> 6 ) % 
(1U << 6 ) ; - OUTI( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; - OUTI( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; - OUTI( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; - OUT( 5) = ( (*in++) >> 30 ) ; - OUT( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); - OUI; - OUTI( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; - OUTI( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack7_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; - OUTI( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; - OUTI( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; - OUTI( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; - OUT( 4) = ( (*in++) >> 28 ) ; - OUT( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); - OUI; - OUTI( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; - OUTI( 6 ) = ( (*in) >> 10 ) % (1U << 7 ) ; - OUTI( 7 ) = ( (*in) >> 17 ) % (1U << 7 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack8_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; - OUTI( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; - OUTI( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; - OUTI( 3) = ( (*in++) >> 24 ) ; - OUTI( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; - OUTI( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; - OUTI( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; - OUTI( 7) = ( (*in++) >> 24 ) ; - return in; -} - -const INLINE uint32_t * __fastunpack9_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; - OUTI( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; - OUTI( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; - OUT( 3) = ( (*in++) >> 27 ) ; - OUT( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); - OUI; - OUTI( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; - OUTI( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; - OUTI( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; - OUT( 7) = ( (*in++) >> 31 ) ; - OUT( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); - OUI; - return in + 1; -} - -const INLINE uint32_t * __fastunpack10_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; 
- OUTI( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; - OUTI( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; - OUT( 3) = ( (*in++) >> 30 ) ; - OUT( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - OUI; - OUTI( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; - OUTI( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; - OUT( 6) = ( (*in++) >> 28 ) ; - OUT( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - OUI; - OUTI( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack11_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ((*in) >> 0 ) % (1U << 11 ) ; - OUTI( 1) = ((*in) >> 11 ) % (1U << 11 ) ; - OUT( 2) = ((*in++) >> 22 ) ; - OUT( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); - OUI; - OUTI( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; - OUTI( 4) = ((*in) >> 12 ) % (1U << 11 ) ; - OUT( 5) = (*in++) >> 23; - OUT( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); - OUI; - OUTI( 6) = ((*in) >> 2 ) % (1U << 11 ) ; - OUTI( 7) = ((*in) >> 13 ) % (1U << 11 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack12_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; - OUTI( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; - OUT( 2) = ( (*in++) >> 24 ) ; - OUT( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - OUI; - OUTI( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; - OUTI( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; - OUT( 5) = ( (*in++) >> 28 ) ; - OUT( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - OUI; - OUTI( 6) = ( (*in) >> 8 ) % (1U << 12 ) ; - OUTI( 7) = ( (*in++) >> 20 ) ; - return in; -} - -const INLINE uint32_t * __fastunpack13_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - OUTI( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; - OUTI( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; - OUT( 2) = ( (*in++) >> 26 ) ; - OUT( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); - OUI; - OUTI( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; - OUT( 4) = ( (*in++) >> 20 ) ; - OUT( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); - OUI; - OUTI( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; - OUTI( 6) = ( (*in) >> 14 ) % (1U << 
13 ) ; - OUT( 7) = ( (*in++) >> 27 ); - OUT( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); - OUI; - return in + 1; -} - -const INLINE uint32_t * __fastunpack14_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - *out++ = ( (*in) >> 0 ) % (1U << 14 ) ; - *out++ = ( (*in) >> 14 ) % (1U << 14 ) ; - *out = ( (*in++) >> 28 ) ; - *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - out++; - *out++ = ( (*in) >> 10 ) % (1U << 14 ) ; - *out = ( (*in++) >> 24 ) ; - *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); - out++; - *out++ = ( (*in) >> 6 ) % (1U << 14 ) ; - *out = ( (*in++) >> 20 ) ; - *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); - out++; - *out++ = ( (*in) >> 2 ) % (1U << 14 ) ; - return in + 1; -} - -const INLINE uint32_t * __fastunpack15_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); - out++; - *out = ( (*in) >> 13 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); - out++; - *out = ( (*in) >> 11 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 15 ) ; - out++; - - return in + 1; - } - - - - -const INLINE uint32_t * __fastunpack16_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - - return in; - } - - - - -const INLINE uint32_t * __fastunpack17_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 17 ) ; - 
out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); - out++; - - return in + 1; - } - - - - -const INLINE uint32_t * __fastunpack18_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - out++; - - return in + 1; - } - - - - -const INLINE uint32_t * __fastunpack19_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 19 ) ; - out++; - - return in + 1; - } - - - - -const INLINE uint32_t * __fastunpack20_8(const uint32_t * __restrict in, uint32_t * __restrict 
out) { - - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - - return in; - } - - - - -const INLINE uint32_t * __fastunpack21_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); - out++; - - return in + 1; - } - - - - -const INLINE uint32_t * __fastunpack22_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - out++; - 
- return in + 1; - } - - - - -const uint32_t * __fastunpack23_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 23 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack24_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack25_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out 
|= ((*in) % (1U<< 15 ))<<( 25 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack26_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack27_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack28_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - 
out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack29_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack30_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - out++; - 
- return in + 1; - } - - - - -const uint32_t * __fastunpack31_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 31 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack32_8(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - - return in; - } - - - - const uint32_t * fastunpack_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullunpacker8(in,out); - - case 1: - return __fastunpack1_8(in,out); - - case 2: - return __fastunpack2_8(in,out); - - case 3: - return __fastunpack3_8(in,out); - - case 4: - return __fastunpack4_8(in,out); - - case 5: - return __fastunpack5_8(in,out); - - case 6: - return __fastunpack6_8(in,out); - - case 7: - return __fastunpack7_8(in,out); - - case 8: - return __fastunpack8_8(in,out); - - case 9: - return __fastunpack9_8(in,out); - - case 10: - return __fastunpack10_8(in,out); - - case 11: - return __fastunpack11_8(in,out); - - case 12: - return 
__fastunpack12_8(in,out); - - case 13: - return __fastunpack13_8(in,out); - - case 14: - return __fastunpack14_8(in,out); - - case 15: - return __fastunpack15_8(in,out); - - case 16: - return __fastunpack16_8(in,out); - - case 17: - return __fastunpack17_8(in,out); - - case 18: - return __fastunpack18_8(in,out); - - case 19: - return __fastunpack19_8(in,out); - - case 20: - return __fastunpack20_8(in,out); - - case 21: - return __fastunpack21_8(in,out); - - case 22: - return __fastunpack22_8(in,out); - - case 23: - return __fastunpack23_8(in,out); - - case 24: - return __fastunpack24_8(in,out); - - case 25: - return __fastunpack25_8(in,out); - - case 26: - return __fastunpack26_8(in,out); - - case 27: - return __fastunpack27_8(in,out); - - case 28: - return __fastunpack28_8(in,out); - - case 29: - return __fastunpack29_8(in,out); - - case 30: - return __fastunpack30_8(in,out); - - case 31: - return __fastunpack31_8(in,out); - - case 32: - return __fastunpack32_8(in,out); - - default: - break; - } - //throw logic_error("number of bits is unsupported"); - } - - - - /*assumes that integers fit in the prescribed number of bits*/ - uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullpacker(in,out); - - case 1: - return __fastpackwithoutmask1_8(in,out); - - case 2: - return __fastpackwithoutmask2_8(in,out); - - case 3: - return __fastpackwithoutmask3_8(in,out); - - case 4: - return __fastpackwithoutmask4_8(in,out); - - case 5: - return __fastpackwithoutmask5_8(in,out); - - case 6: - return __fastpackwithoutmask6_8(in,out); - - case 7: - return __fastpackwithoutmask7_8(in,out); - - case 8: - return __fastpackwithoutmask8_8(in,out); - - case 9: - return __fastpackwithoutmask9_8(in,out); - - case 10: - return __fastpackwithoutmask10_8(in,out); - - case 11: - return __fastpackwithoutmask11_8(in,out); - - case 12: - return __fastpackwithoutmask12_8(in,out); - - case 13: - return 
__fastpackwithoutmask13_8(in,out); - - case 14: - return __fastpackwithoutmask14_8(in,out); - - case 15: - return __fastpackwithoutmask15_8(in,out); - - case 16: - return __fastpackwithoutmask16_8(in,out); - - case 17: - return __fastpackwithoutmask17_8(in,out); - - case 18: - return __fastpackwithoutmask18_8(in,out); - - case 19: - return __fastpackwithoutmask19_8(in,out); - - case 20: - return __fastpackwithoutmask20_8(in,out); - - case 21: - return __fastpackwithoutmask21_8(in,out); - - case 22: - return __fastpackwithoutmask22_8(in,out); - - case 23: - return __fastpackwithoutmask23_8(in,out); - - case 24: - return __fastpackwithoutmask24_8(in,out); - - case 25: - return __fastpackwithoutmask25_8(in,out); - - case 26: - return __fastpackwithoutmask26_8(in,out); - - case 27: - return __fastpackwithoutmask27_8(in,out); - - case 28: - return __fastpackwithoutmask28_8(in,out); - - case 29: - return __fastpackwithoutmask29_8(in,out); - - case 30: - return __fastpackwithoutmask30_8(in,out); - - case 31: - return __fastpackwithoutmask31_8(in,out); - - case 32: - return __fastpackwithoutmask32_8(in,out); - - default: - break; - } - //throw logic_error("number of bits is unsupported"); - } - - - const uint32_t * nullunpacker16(const uint32_t * __restrict in, uint32_t * __restrict out) { - memset(out,0,16 * 4); - return in; - } - - - uint32_t * __fastpackwithoutmask1_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 15 ; - 
++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask2_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask3_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 3 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask4_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( 
(*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask5_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 5 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 5 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask6_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 6 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 6 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask7_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - 
*out = ( (*in) ) >> ( 7 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 7 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 7 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask8_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask9_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 9 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 9 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 9 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = 
( (*in) ) >> ( 9 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask10_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 10 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 10 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 10 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 10 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask11_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 11 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 11 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 11 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 11 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 11 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask12_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - 
++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask13_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 13 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 13 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 13 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 13 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 13 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 13 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask14_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 14 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 
24 ; - ++out; - *out = ( (*in) ) >> ( 14 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 14 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 14 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 14 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 14 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask15_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 15 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 15 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 15 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 15 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 15 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 15 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 15 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask16_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= 
( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask17_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 17 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 17 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 17 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 17 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 17 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 17 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 17 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 17 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask18_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 18 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 18 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 18 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 18 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 18 
- 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 18 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 18 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 18 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask19_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 19 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 19 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 19 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 19 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 19 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 19 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 19 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 19 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 19 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask20_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out 
= ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask21_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 21 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 21 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 21 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 21 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 21 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 21 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 21 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 21 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 21 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 21 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask22_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 22 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 22 - 2 ); - ++in; - *out |= ( (*in) 
) << 2 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 22 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 22 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 22 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 22 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 22 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 22 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 22 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 22 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask23_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 23 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 23 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 23 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 23 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 23 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 23 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 23 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 23 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 23 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 23 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 23 - 16 ); - ++in; - - return out + 1; - } - 
- - - uint32_t * __fastpackwithoutmask24_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask25_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 25 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 25 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 25 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 25 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 25 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 25 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 25 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 25 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 25 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - 
*out = ( (*in) ) >> ( 25 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 25 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 25 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask26_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 26 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 26 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 26 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 26 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 26 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 26 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 26 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 26 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 26 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 26 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 26 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 26 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask27_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 27 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 27 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 27 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 27 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - *out = ( (*in) ) >> ( 27 - 
2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 27 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 27 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 27 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 27 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 27 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 27 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 27 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 27 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask28_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * 
__fastpackwithoutmask29_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 29 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 29 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 29 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 29 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 29 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 29 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 29 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 29 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++out; - *out = ( (*in) ) >> ( 29 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 29 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 29 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 29 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 29 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 29 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask30_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 30 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 30 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 30 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 30 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 30 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 30 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 30 - 16 ); - ++in; - *out |= ( 
(*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 30 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 30 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 30 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 30 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 30 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 30 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - *out = ( (*in) ) >> ( 30 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask31_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 31 - 30 ); - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 31 - 29 ); - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 31 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 31 - 27 ); - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 31 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 31 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 31 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 31 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 31 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 31 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 31 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 31 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 31 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 31 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 31 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * 
__fastpackwithoutmask32_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - - return out; - } - - - - -const uint32_t * __fastunpack1_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) & 1 ; - out++; - *out = ( (*in) >> 1 ) & 1 ; - out++; - *out = ( (*in) >> 2 ) & 1 ; - out++; - *out = ( (*in) >> 3 ) & 1 ; - out++; - *out = ( (*in) >> 4 ) & 1 ; - out++; - *out = ( (*in) >> 5 ) & 1 ; - out++; - *out = ( (*in) >> 6 ) & 1 ; - out++; - *out = ( (*in) >> 7 ) & 1 ; - out++; - *out = ( (*in) >> 8 ) & 1 ; - out++; - *out = ( (*in) >> 9 ) & 1 ; - out++; - *out = ( (*in) >> 10 ) & 1 ; - out++; - *out = ( (*in) >> 11 ) & 1 ; - out++; - *out = ( (*in) >> 12 ) & 1 ; - out++; - *out = ( (*in) >> 13 ) & 1 ; - out++; - *out = ( (*in) >> 14 ) & 1 ; - out++; - *out = ( (*in) >> 15 ) & 1 ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack2_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 2 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 2 ) ; - 
out++; - *out = ( (*in) >> 20 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 22 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 26 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 28 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack3_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 3 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 9 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 21 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 27 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 7 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 3 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack4_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 4 ) ; - out++; - *out = 
( (*in) >> 20 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack5_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 5 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 25 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 23 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 11 ) % (1U << 5 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack6_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 22 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 6 ) ; - out++; - 
*out = ( (*in) >> 20 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack7_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 7 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 21 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 17 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 9 ) % (1U << 7 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack8_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - - 
return in; - } - - - - -const uint32_t * __fastunpack9_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 9 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 22 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 17 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 21 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 9 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack10_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); - out++; - *out = ( (*in) >> 2 ) % 
(1U << 10 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack11_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 11 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 11 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack12_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - 
out++; - *out = ( (*in) >> 4 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack13_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 13 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack14_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 
14 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack15_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); - out++; - *out = ( (*in) >> 13 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); - out++; - *out = ( (*in) >> 11 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 15 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack16_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 
0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack17_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); - out++; - *out = ( (*in) >> 14 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack18_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( 
(*in) >> 0 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack19_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); - out++; - *out = ( (*in) >> 11 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); - out++; - *out = ( 
(*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack20_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack21_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; 
- ++in; - *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack22_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= 
((*in) % (1U<< 20 ))<<( 22 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack23_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack24_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - 
++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack25_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 
25 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack26_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack27_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 
27 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack28_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 
) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack29_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); - out++; - *out = ( (*in) >> 5 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack30_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 
); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - out++; - *out = ( (*in) >> 2 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack31_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 31 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); - out++; 
- *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack32_16(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - - return in; - } - - - - const uint32_t * fastunpack_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullunpacker16(in,out); - - case 1: - return __fastunpack1_16(in,out); - - case 2: - return __fastunpack2_16(in,out); - - case 3: - return __fastunpack3_16(in,out); - - case 4: - return __fastunpack4_16(in,out); - - case 5: - return __fastunpack5_16(in,out); - - case 6: - return 
__fastunpack6_16(in,out); - - case 7: - return __fastunpack7_16(in,out); - - case 8: - return __fastunpack8_16(in,out); - - case 9: - return __fastunpack9_16(in,out); - - case 10: - return __fastunpack10_16(in,out); - - case 11: - return __fastunpack11_16(in,out); - - case 12: - return __fastunpack12_16(in,out); - - case 13: - return __fastunpack13_16(in,out); - - case 14: - return __fastunpack14_16(in,out); - - case 15: - return __fastunpack15_16(in,out); - - case 16: - return __fastunpack16_16(in,out); - - case 17: - return __fastunpack17_16(in,out); - - case 18: - return __fastunpack18_16(in,out); - - case 19: - return __fastunpack19_16(in,out); - - case 20: - return __fastunpack20_16(in,out); - - case 21: - return __fastunpack21_16(in,out); - - case 22: - return __fastunpack22_16(in,out); - - case 23: - return __fastunpack23_16(in,out); - - case 24: - return __fastunpack24_16(in,out); - - case 25: - return __fastunpack25_16(in,out); - - case 26: - return __fastunpack26_16(in,out); - - case 27: - return __fastunpack27_16(in,out); - - case 28: - return __fastunpack28_16(in,out); - - case 29: - return __fastunpack29_16(in,out); - - case 30: - return __fastunpack30_16(in,out); - - case 31: - return __fastunpack31_16(in,out); - - case 32: - return __fastunpack32_16(in,out); - - default: - break; - } - //throw logic_error("number of bits is unsupported"); - } - - - - /*assumes that integers fit in the prescribed number of bits*/ - uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullpacker(in,out); - - case 1: - return __fastpackwithoutmask1_16(in,out); - - case 2: - return __fastpackwithoutmask2_16(in,out); - - case 3: - return __fastpackwithoutmask3_16(in,out); - - case 4: - return __fastpackwithoutmask4_16(in,out); - - case 5: - return __fastpackwithoutmask5_16(in,out); - - case 6: - return __fastpackwithoutmask6_16(in,out); - - case 7: - return 
__fastpackwithoutmask7_16(in,out); - - case 8: - return __fastpackwithoutmask8_16(in,out); - - case 9: - return __fastpackwithoutmask9_16(in,out); - - case 10: - return __fastpackwithoutmask10_16(in,out); - - case 11: - return __fastpackwithoutmask11_16(in,out); - - case 12: - return __fastpackwithoutmask12_16(in,out); - - case 13: - return __fastpackwithoutmask13_16(in,out); - - case 14: - return __fastpackwithoutmask14_16(in,out); - - case 15: - return __fastpackwithoutmask15_16(in,out); - - case 16: - return __fastpackwithoutmask16_16(in,out); - - case 17: - return __fastpackwithoutmask17_16(in,out); - - case 18: - return __fastpackwithoutmask18_16(in,out); - - case 19: - return __fastpackwithoutmask19_16(in,out); - - case 20: - return __fastpackwithoutmask20_16(in,out); - - case 21: - return __fastpackwithoutmask21_16(in,out); - - case 22: - return __fastpackwithoutmask22_16(in,out); - - case 23: - return __fastpackwithoutmask23_16(in,out); - - case 24: - return __fastpackwithoutmask24_16(in,out); - - case 25: - return __fastpackwithoutmask25_16(in,out); - - case 26: - return __fastpackwithoutmask26_16(in,out); - - case 27: - return __fastpackwithoutmask27_16(in,out); - - case 28: - return __fastpackwithoutmask28_16(in,out); - - case 29: - return __fastpackwithoutmask29_16(in,out); - - case 30: - return __fastpackwithoutmask30_16(in,out); - - case 31: - return __fastpackwithoutmask31_16(in,out); - - case 32: - return __fastpackwithoutmask32_16(in,out); - - default: - break; - } - //throw logic_error("number of bits is unsupported"); - } - - - const uint32_t * nullunpacker24(const uint32_t * __restrict in, uint32_t * __restrict out) { - memset(out,0,24 * 4); - return in; - } - - - uint32_t * __fastpackwithoutmask1_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) 
<< 5 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask2_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask3_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 18 ; - 
++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 3 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 3 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask4_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask5_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 
25 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 5 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 5 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 5 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask6_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 6 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 6 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 6 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask7_24(const uint32_t * __restrict in, uint32_t * 
__restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 7 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 7 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 7 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 7 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 7 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask8_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 
; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask9_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 9 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 9 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 9 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 9 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 9 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 9 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask10_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 10 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 10 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 10 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - 
*out = ( (*in) ) >> ( 10 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 10 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 10 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask11_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 11 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 11 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 11 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 11 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 11 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 11 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 11 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 11 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask12_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - 
*out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask13_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 13 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 13 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 13 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 13 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 13 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 13 - 3 ); - ++in; - 
*out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 13 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 13 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 13 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask14_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 14 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 14 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 14 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 14 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 14 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 14 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 14 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 14 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 14 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask15_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 
15 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 15 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 15 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 15 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 15 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 15 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 15 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 15 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 15 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 15 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 15 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 15 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask16_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - 
*out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask17_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 17 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 17 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 17 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 17 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 17 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 17 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 17 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 17 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 17 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 17 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 17 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 17 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask18_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 18 - 4 ); - ++in; - *out |= ( 
(*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 18 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 18 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 18 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 18 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 18 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 18 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 18 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 18 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 18 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 18 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 18 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask19_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 19 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 19 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 19 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 19 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 19 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( 
(*in) ) >> ( 19 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 19 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 19 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 19 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 19 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 19 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 19 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 19 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 19 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask20_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - 
*out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask21_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 21 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 21 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 21 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 21 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 21 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 21 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 21 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 21 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 21 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 21 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 21 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 21 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 21 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 21 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 21 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - - return out + 1; 
- } - - - - uint32_t * __fastpackwithoutmask22_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 22 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 22 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 22 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 22 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 22 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 22 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 22 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 22 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 22 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 22 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 22 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 22 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 22 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 22 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 22 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask23_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 23 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 23 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 28 ; - 
++out; - *out = ( (*in) ) >> ( 23 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 23 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 23 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 23 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 23 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 23 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 23 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 23 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 23 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 23 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 23 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 23 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 23 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 23 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 23 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask24_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = 
( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask25_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 25 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 25 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 25 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 25 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 25 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 25 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 25 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 25 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 25 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 25 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 25 - 23 ); - ++in; - *out |= ( (*in) ) << 
23 ; - ++out; - *out = ( (*in) ) >> ( 25 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 25 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 25 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 25 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 25 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 25 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 25 - 24 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask26_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 26 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 26 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 26 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 26 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 26 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 26 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 26 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 26 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 26 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 26 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 26 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 26 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 26 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 26 
- 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 26 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 26 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 26 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 26 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask27_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 27 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 27 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 27 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 27 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - *out = ( (*in) ) >> ( 27 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 27 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 27 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 27 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 27 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 27 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 27 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 27 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 27 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 27 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 27 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 27 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 27 - 23 ); - ++in; - *out |= ( (*in) 
) << 23 ; - ++out; - *out = ( (*in) ) >> ( 27 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 27 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 27 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask28_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - - 
return out; - } - - - - uint32_t * __fastpackwithoutmask29_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 29 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 29 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 29 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 29 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 29 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 29 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 29 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 29 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++out; - *out = ( (*in) ) >> ( 29 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 29 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 29 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 29 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 29 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 29 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 29 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 29 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 29 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - *out = ( (*in) ) >> ( 29 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - *out = ( (*in) ) >> ( 29 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 29 - 27 ); - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 29 - 24 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask30_24(const uint32_t * __restrict in, 
uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 30 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 30 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 30 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 30 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 30 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 30 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 30 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 30 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 30 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 30 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 30 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 30 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 30 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - *out = ( (*in) ) >> ( 30 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 30 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 30 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 30 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 30 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 30 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 30 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 30 - 16 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask31_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out 
= ( (*in) ) >> ( 31 - 30 ); - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 31 - 29 ); - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 31 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 31 - 27 ); - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 31 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 31 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 31 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 31 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 31 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 31 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 31 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 31 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 31 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 31 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 31 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 31 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 31 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 31 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 31 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 31 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 31 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 31 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 31 - 8 ); - ++in; - - return out + 1; - } - - - - uint32_t * __fastpackwithoutmask32_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - 
*out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - - return out; - } - - - - -const uint32_t * __fastunpack1_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) & 1 ; - out++; - *out = ( (*in) >> 1 ) & 1 ; - out++; - *out = ( (*in) >> 2 ) & 1 ; - out++; - *out = ( (*in) >> 3 ) & 1 ; - out++; - *out = ( (*in) >> 4 ) & 1 ; - out++; - *out = ( (*in) >> 5 ) & 1 ; - out++; - *out = ( (*in) >> 6 ) & 1 ; - out++; - *out = ( (*in) >> 7 ) & 1 ; - out++; - *out = ( (*in) >> 8 ) & 1 ; - out++; - *out = ( (*in) >> 9 ) & 1 ; - out++; - *out = ( (*in) >> 10 ) & 1 ; - out++; - *out = ( (*in) >> 11 ) & 1 ; - out++; - *out = ( (*in) >> 12 ) & 1 ; - out++; - *out = ( (*in) >> 13 ) & 1 ; - out++; - *out = ( (*in) >> 14 ) & 1 ; - out++; - *out = ( (*in) >> 15 ) & 1 ; - out++; - *out = ( (*in) >> 16 ) & 1 ; - out++; - *out = ( (*in) >> 17 ) & 1 ; - out++; - *out = ( (*in) >> 18 ) & 1 ; - out++; - *out = ( (*in) >> 19 ) & 1 ; - out++; - *out = ( (*in) >> 20 ) & 1 ; - out++; - *out = ( (*in) >> 21 ) & 1 ; - out++; - *out = ( (*in) >> 22 ) & 1 ; - out++; - *out = ( (*in) >> 23 ) & 1 ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack2_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 2 ) ; - out++; 
- *out = ( (*in) >> 2 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 22 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 26 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 28 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 2 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 2 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 2 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack3_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 3 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 9 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 21 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 27 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 7 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 3 ) ; - 
out++; - *out = ( (*in) >> 13 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 19 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 22 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 25 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 28 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 3 ) ; - out++; - *out = ( (*in) >> 5 ) % (1U << 3 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack4_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 4 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 4 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack5_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 5 ) % (1U << 5 ) ; - out++; - *out = ( 
(*in) >> 10 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 25 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 23 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 11 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 21 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 26 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 9 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 5 ) ; - out++; - *out = ( (*in) >> 19 ) % (1U << 5 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack6_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 22 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 6 ) ; - 
out++; - *out = ( (*in) >> 14 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 6 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 6 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 6 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack7_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 7 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 21 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 17 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 24 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 9 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 23 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 19 ) % (1U << 7 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); 
- out++; - *out = ( (*in) >> 1 ) % (1U << 7 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack8_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 8 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 8 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack9_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 9 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 22 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 17 ) % (1U << 9 ) ; - 
out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 21 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 11 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 9 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 9 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack10_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 20 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 10 ) % (1U << 10 ) ; - 
out++; - *out = ( (*in) >> 20 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 10 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 10 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack11_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 11 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 17 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 18 ) % (1U << 11 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); - out++; - 
- return in + 1; - } - - - - -const uint32_t * __fastunpack12_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 12 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 12 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack13_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 13 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); - out++; - *out = ( (*in) >> 1 ) % 
(1U << 13 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 17 ) % (1U << 13 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); - out++; - *out = ( (*in) >> 11 ) % (1U << 13 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack14_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 14 ) 
; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 14 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 14 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 14 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack15_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 15 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); - out++; - *out = ( (*in) >> 13 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); - out++; - *out = ( (*in) >> 11 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); - out++; - *out = ( (*in) >> 1 ) 
% (1U << 15 ) ; - out++; - *out = ( (*in) >> 16 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); - out++; - *out = ( (*in) >> 14 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 15 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack16_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 16 ) ; - out++; - *out = ( (*in) >> 16 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack17_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 17 ) ; - out++; 
- *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); - out++; - *out = ( (*in) >> 14 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 17 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 17 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack18_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - out++; - 
*out = ( (*in) >> 8 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack19_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); - out++; - 
*out = ( (*in) >> 5 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); - out++; - *out = ( (*in) >> 11 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack20_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % 
(1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack21_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 28 ) ; 
- ++in; - *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 21 ) ; - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack22_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= 
((*in) % (1U<< 20 ))<<( 22 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack23_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); - out++; - *out = ( (*in) >> 2 ) % 
(1U << 23 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack24_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( 
(*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack25_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % 
(1U<< 2 ))<<( 25 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack26_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - 
out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack27_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); 
- out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack28_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - 
out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack29_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); - out++; - *out = ( (*in) >> 5 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= 
((*in) % (1U<< 16 ))<<( 29 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack30_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= 
((*in) % (1U<< 4 ))<<( 30 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - out++; - *out = ( (*in) >> 2 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack31_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 31 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); - 
out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); - out++; - - return in + 1; - } - - - - -const uint32_t * __fastunpack32_24(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; 
- ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - - return in; - } - - - - const uint32_t * fastunpack_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullunpacker24(in,out); - - case 1: - return __fastunpack1_24(in,out); - - case 2: - return __fastunpack2_24(in,out); - - case 3: - return __fastunpack3_24(in,out); - - case 4: - return __fastunpack4_24(in,out); - - case 5: - return __fastunpack5_24(in,out); - - case 6: - return __fastunpack6_24(in,out); - - case 7: - return __fastunpack7_24(in,out); - - case 8: - return __fastunpack8_24(in,out); - - case 9: - return __fastunpack9_24(in,out); - - case 10: - return __fastunpack10_24(in,out); - - case 11: - return __fastunpack11_24(in,out); - - case 12: - return __fastunpack12_24(in,out); - - case 13: - return __fastunpack13_24(in,out); - - case 14: - return __fastunpack14_24(in,out); - - case 15: - return __fastunpack15_24(in,out); - - case 16: - return __fastunpack16_24(in,out); - - case 17: - return __fastunpack17_24(in,out); - - case 18: - return __fastunpack18_24(in,out); - - case 19: - return __fastunpack19_24(in,out); - - case 20: - return __fastunpack20_24(in,out); - - case 21: - return __fastunpack21_24(in,out); - - case 22: - return __fastunpack22_24(in,out); - - case 23: - return __fastunpack23_24(in,out); - - case 24: - return __fastunpack24_24(in,out); - - case 25: - return __fastunpack25_24(in,out); - - case 26: - return __fastunpack26_24(in,out); - - case 27: - return __fastunpack27_24(in,out); - - case 28: - return __fastunpack28_24(in,out); - - case 29: - return __fastunpack29_24(in,out); - - case 30: - return __fastunpack30_24(in,out); - - case 31: - return __fastunpack31_24(in,out); - - case 32: - return __fastunpack32_24(in,out); - - default: - break; 
- } - //throw logic_error("number of bits is unsupported"); - } - - - - /*assumes that integers fit in the prescribed number of bits*/ - uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullpacker(in,out); - - case 1: - return __fastpackwithoutmask1_24(in,out); - - case 2: - return __fastpackwithoutmask2_24(in,out); - - case 3: - return __fastpackwithoutmask3_24(in,out); - - case 4: - return __fastpackwithoutmask4_24(in,out); - - case 5: - return __fastpackwithoutmask5_24(in,out); - - case 6: - return __fastpackwithoutmask6_24(in,out); - - case 7: - return __fastpackwithoutmask7_24(in,out); - - case 8: - return __fastpackwithoutmask8_24(in,out); - - case 9: - return __fastpackwithoutmask9_24(in,out); - - case 10: - return __fastpackwithoutmask10_24(in,out); - - case 11: - return __fastpackwithoutmask11_24(in,out); - - case 12: - return __fastpackwithoutmask12_24(in,out); - - case 13: - return __fastpackwithoutmask13_24(in,out); - - case 14: - return __fastpackwithoutmask14_24(in,out); - - case 15: - return __fastpackwithoutmask15_24(in,out); - - case 16: - return __fastpackwithoutmask16_24(in,out); - - case 17: - return __fastpackwithoutmask17_24(in,out); - - case 18: - return __fastpackwithoutmask18_24(in,out); - - case 19: - return __fastpackwithoutmask19_24(in,out); - - case 20: - return __fastpackwithoutmask20_24(in,out); - - case 21: - return __fastpackwithoutmask21_24(in,out); - - case 22: - return __fastpackwithoutmask22_24(in,out); - - case 23: - return __fastpackwithoutmask23_24(in,out); - - case 24: - return __fastpackwithoutmask24_24(in,out); - - case 25: - return __fastpackwithoutmask25_24(in,out); - - case 26: - return __fastpackwithoutmask26_24(in,out); - - case 27: - return __fastpackwithoutmask27_24(in,out); - - case 28: - return __fastpackwithoutmask28_24(in,out); - - case 29: - return __fastpackwithoutmask29_24(in,out); - - case 30: - return 
__fastpackwithoutmask30_24(in,out); - - case 31: - return __fastpackwithoutmask31_24(in,out); - - case 32: - return __fastpackwithoutmask32_24(in,out); - - default: - break; - } - //throw logic_error("number of bits is unsupported"); - } - - - const uint32_t * nullunpacker32(const uint32_t * __restrict in, uint32_t * __restrict out) { - memset(out,0,32 * 4); - return in; - } - - - uint32_t * __fastpackwithoutmask1_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask2_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= 
( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask3_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 3 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 3 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( 
(*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask4_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask5_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 5 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out 
|= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 5 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 5 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 5 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask6_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 6 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 6 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 6 - 4 ); - ++in; - *out |= ( (*in) ) 
<< 4 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 6 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask7_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 7 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 7 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 7 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 7 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 7 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 7 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask8_32(const uint32_t * 
__restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask9_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 9 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 9 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 9 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 9 - 7 ); - ++in; - 
*out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 9 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 9 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 9 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 9 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask10_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 10 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 10 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 10 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 10 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 10 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 10 - 6 ); - ++in; - *out |= ( (*in) ) 
<< 6 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 10 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 10 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask11_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 11 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 11 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 11 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 11 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 11 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 11 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 11 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 11 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 11 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 11 
- 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask12_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 12 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 12 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask13_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 13 - 7 ); - ++in; - *out |= ( (*in) ) << 7 
; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 13 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 13 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 13 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 13 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 13 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 13 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 13 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 13 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 13 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 13 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 13 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask14_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 14 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 14 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 14 - 2 ); - ++in; - 
*out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 14 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 14 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 14 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 14 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 14 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 14 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 14 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 14 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 14 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask15_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 15 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 15 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 15 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 15 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 15 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 
15 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 15 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 15 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 16 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 15 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 15 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 15 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 15 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 15 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 15 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 15 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask16_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - 
*out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask17_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 17 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 17 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 17 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 17 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 17 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 17 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 17 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 17 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 17 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 17 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 17 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 17 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 17 - 9 ); - ++in; - *out 
|= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 17 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 17 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 17 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask18_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 18 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 18 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 18 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 18 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 18 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 18 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 18 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 18 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 18 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 18 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 18 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 18 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 18 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - 
*out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 18 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 18 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 18 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask19_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 19 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 19 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 19 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 19 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 19 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 19 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 19 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 19 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 19 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 19 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 19 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 19 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 19 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 19 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 27 
; - ++out; - *out = ( (*in) ) >> ( 19 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 19 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 19 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 19 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask20_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 20 
- 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 20 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 20 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 20 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask21_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 21 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 21 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 21 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 21 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 21 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 21 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 21 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 21 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 21 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 21 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 21 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 21 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 21 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 21 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 21 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - 
*out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 21 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 21 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 21 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 21 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 21 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask22_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 22 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 22 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 22 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 22 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 22 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 22 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 22 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 22 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 22 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 22 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 22 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 22 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 22 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) 
) >> ( 22 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 22 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 22 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 22 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 22 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 22 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 22 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask23_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 23 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 23 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 23 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 23 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 23 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 23 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 23 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 23 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 23 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 23 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 23 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 23 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 
23 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 23 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 23 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 23 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 23 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 23 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 23 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 23 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 23 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 23 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask24_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( 
(*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 24 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 24 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask25_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 25 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 25 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 25 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 25 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 25 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 25 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 25 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 25 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 25 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 25 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 25 - 23 ); - ++in; - *out |= ( (*in) ) << 23 
; - ++out; - *out = ( (*in) ) >> ( 25 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 25 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 25 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 25 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 25 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 25 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 25 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 25 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 25 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 25 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 25 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 25 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 25 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask26_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 26 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 26 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 26 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 26 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 26 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 26 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 26 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 26 - 4 ); - ++in; - *out |= ( (*in) 
) << 4 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 26 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 26 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 26 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 26 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 26 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 26 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 26 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 26 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 26 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 26 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 26 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 26 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 26 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 26 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 26 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 26 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask27_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 27 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 27 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 27 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 27 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - *out = ( (*in) 
) >> ( 27 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 27 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 27 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 27 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 27 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 27 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 27 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 27 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 27 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 27 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 27 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 27 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 27 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 27 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 27 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 27 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 27 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 27 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 27 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 27 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 27 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 27 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask28_32(const uint32_t * __restrict in, uint32_t * __restrict 
out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 28 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 28 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 28 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 28 - 12 ); - ++in; - *out 
|= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 28 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 28 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask29_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 29 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 29 - 23 ); - ++in; - *out |= ( (*in) ) << 23 ; - ++out; - *out = ( (*in) ) >> ( 29 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 29 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 29 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 29 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 29 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 29 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++out; - *out = ( (*in) ) >> ( 29 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 29 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 29 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 29 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 29 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 29 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 29 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 29 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 29 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - *out = ( (*in) ) >> ( 29 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - *out = ( (*in) ) >> ( 29 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 29 - 27 ); 
- ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 29 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 29 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 29 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 29 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 29 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 29 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 29 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 29 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask30_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 30 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 30 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 30 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 30 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 30 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 30 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 30 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 30 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 30 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 30 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 30 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 30 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 30 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - *out = ( (*in) ) >> ( 30 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++out; - ++in; - 
*out = (*in) ; - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 30 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 30 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 30 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 30 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 30 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 30 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 30 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 30 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 30 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 30 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 30 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 30 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 30 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - *out = ( (*in) ) >> ( 30 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask31_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++in; - *out |= ( (*in) ) << 31 ; - ++out; - *out = ( (*in) ) >> ( 31 - 30 ); - ++in; - *out |= ( (*in) ) << 30 ; - ++out; - *out = ( (*in) ) >> ( 31 - 29 ); - ++in; - *out |= ( (*in) ) << 29 ; - ++out; - *out = ( (*in) ) >> ( 31 - 28 ); - ++in; - *out |= ( (*in) ) << 28 ; - ++out; - *out = ( (*in) ) >> ( 31 - 27 ); - ++in; - *out |= ( (*in) ) << 27 ; - ++out; - *out = ( (*in) ) >> ( 31 - 26 ); - ++in; - *out |= ( (*in) ) << 26 ; - ++out; - *out = ( (*in) ) >> ( 31 - 25 ); - ++in; - *out |= ( (*in) ) << 25 ; - ++out; - *out = ( (*in) ) >> ( 31 - 24 ); - ++in; - *out |= ( (*in) ) << 24 ; - ++out; - *out = ( (*in) ) >> ( 31 - 23 ); - ++in; - *out |= ( (*in) ) << 23 
; - ++out; - *out = ( (*in) ) >> ( 31 - 22 ); - ++in; - *out |= ( (*in) ) << 22 ; - ++out; - *out = ( (*in) ) >> ( 31 - 21 ); - ++in; - *out |= ( (*in) ) << 21 ; - ++out; - *out = ( (*in) ) >> ( 31 - 20 ); - ++in; - *out |= ( (*in) ) << 20 ; - ++out; - *out = ( (*in) ) >> ( 31 - 19 ); - ++in; - *out |= ( (*in) ) << 19 ; - ++out; - *out = ( (*in) ) >> ( 31 - 18 ); - ++in; - *out |= ( (*in) ) << 18 ; - ++out; - *out = ( (*in) ) >> ( 31 - 17 ); - ++in; - *out |= ( (*in) ) << 17 ; - ++out; - *out = ( (*in) ) >> ( 31 - 16 ); - ++in; - *out |= ( (*in) ) << 16 ; - ++out; - *out = ( (*in) ) >> ( 31 - 15 ); - ++in; - *out |= ( (*in) ) << 15 ; - ++out; - *out = ( (*in) ) >> ( 31 - 14 ); - ++in; - *out |= ( (*in) ) << 14 ; - ++out; - *out = ( (*in) ) >> ( 31 - 13 ); - ++in; - *out |= ( (*in) ) << 13 ; - ++out; - *out = ( (*in) ) >> ( 31 - 12 ); - ++in; - *out |= ( (*in) ) << 12 ; - ++out; - *out = ( (*in) ) >> ( 31 - 11 ); - ++in; - *out |= ( (*in) ) << 11 ; - ++out; - *out = ( (*in) ) >> ( 31 - 10 ); - ++in; - *out |= ( (*in) ) << 10 ; - ++out; - *out = ( (*in) ) >> ( 31 - 9 ); - ++in; - *out |= ( (*in) ) << 9 ; - ++out; - *out = ( (*in) ) >> ( 31 - 8 ); - ++in; - *out |= ( (*in) ) << 8 ; - ++out; - *out = ( (*in) ) >> ( 31 - 7 ); - ++in; - *out |= ( (*in) ) << 7 ; - ++out; - *out = ( (*in) ) >> ( 31 - 6 ); - ++in; - *out |= ( (*in) ) << 6 ; - ++out; - *out = ( (*in) ) >> ( 31 - 5 ); - ++in; - *out |= ( (*in) ) << 5 ; - ++out; - *out = ( (*in) ) >> ( 31 - 4 ); - ++in; - *out |= ( (*in) ) << 4 ; - ++out; - *out = ( (*in) ) >> ( 31 - 3 ); - ++in; - *out |= ( (*in) ) << 3 ; - ++out; - *out = ( (*in) ) >> ( 31 - 2 ); - ++in; - *out |= ( (*in) ) << 2 ; - ++out; - *out = ( (*in) ) >> ( 31 - 1 ); - ++in; - *out |= ( (*in) ) << 1 ; - ++out; - ++in; - - return out; - } - - - - uint32_t * __fastpackwithoutmask32_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out 
= (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - *out = (*in) ; - ++out; - ++in; - - return out; - } - -#if 1 -#define DST(__x) out[__x] -#define DSI -#else -#define DST(__x) *out++ -#define DSI -#endif - -const uint32_t * __fastunpack1_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) & 1 ; - DSI; - DST( 1) = ( (*in) >> 1 ) & 1 ; - DSI; - DST( 2) = ( (*in) >> 2 ) & 1 ; - DSI; - DST( 3) = ( (*in) >> 3 ) & 1 ; - DSI; - DST( 4) = ( (*in) >> 4 ) & 1 ; - DSI; - DST( 5) = ( (*in) >> 5 ) & 1 ; - DSI; - DST( 6) = ( (*in) >> 6 ) & 1 ; - DSI; - DST( 7) = ( (*in) >> 7 ) & 1 ; - DSI; - DST( 8) = ( (*in) >> 8 ) & 1 ; - DSI; - DST( 9) = ( (*in) >> 9 ) & 1 ; - DSI; - DST(10) = ( (*in) >> 10 ) & 1 ; - DSI; - DST(11) = ( (*in) >> 11 ) & 1 ; - DSI; - DST(12) = ( (*in) >> 12 ) & 1 ; - DSI; - DST(13) = ( (*in) >> 13 ) & 1 ; - DSI; - DST(14) = ( (*in) >> 14 ) & 1 ; - DSI; - DST(15) = ( (*in) >> 15 ) & 1 ; - DSI; - DST(16) = ( (*in) >> 16 ) & 1 ; - DSI; - DST(17) = ( (*in) >> 17 ) & 1 ; - DSI; - DST(18) = ( (*in) >> 18 ) & 1 ; - DSI; - DST(19) = ( (*in) >> 19 ) & 
1 ; - DSI; - DST(20) = ( (*in) >> 20 ) & 1 ; - DSI; - DST(21) = ( (*in) >> 21 ) & 1 ; - DSI; - DST(22) = ( (*in) >> 22 ) & 1 ; - DSI; - DST(23) = ( (*in) >> 23 ) & 1 ; - DSI; - DST(24) = ( (*in) >> 24 ) & 1 ; - DSI; - DST(25) = ( (*in) >> 25 ) & 1 ; - DSI; - DST(26) = ( (*in) >> 26 ) & 1 ; - DSI; - DST(27) = ( (*in) >> 27 ) & 1 ; - DSI; - DST(28) = ( (*in) >> 28 ) & 1 ; - DSI; - DST(29) = ( (*in) >> 29 ) & 1 ; - DSI; - DST(30) = ( (*in) >> 30 ) & 1 ; - DSI; - DST(31) = ( (*in) >> 31 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack2_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; - DSI; - DST( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; - DSI; - DST( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; - DSI; - DST( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; - DSI; - DST( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; - DSI; - DST( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; - DSI; - DST( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; - DSI; - DST( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; - DSI; - DST( 8) = ( (*in) >> 16 ) % (1U << 2 ) ; - DSI; - DST( 9) = ( (*in) >> 18 ) % (1U << 2 ) ; - DSI; - DST(10) = ( (*in) >> 20 ) % (1U << 2 ) ; - DSI; - DST(11) = ( (*in) >> 22 ) % (1U << 2 ) ; - DSI; - DST(12) = ( (*in) >> 24 ) % (1U << 2 ) ; - DSI; - DST(13) = ( (*in) >> 26 ) % (1U << 2 ) ; - DSI; - DST(14) = ( (*in) >> 28 ) % (1U << 2 ) ; - DSI; - DST(15) = ( (*in) >> 30 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 2 ) ; - DSI; - DST(17) = ( (*in) >> 2 ) % (1U << 2 ) ; - DSI; - DST(18) = ( (*in) >> 4 ) % (1U << 2 ) ; - DSI; - DST(19) = ( (*in) >> 6 ) % (1U << 2 ) ; - DSI; - DST(20) = ( (*in) >> 8 ) % (1U << 2 ) ; - DSI; - DST(21) = ( (*in) >> 10 ) % (1U << 2 ) ; - DSI; - DST(22) = ( (*in) >> 12 ) % (1U << 2 ) ; - DSI; - DST(23) = ( (*in) >> 14 ) % (1U << 2 ) ; - DSI; - DST(24) = ( (*in) >> 16 ) % (1U << 2 ) ; - DSI; - DST(25) = ( (*in) >> 18 ) % (1U << 2 ) ; - DSI; - DST(26) = ( (*in) >> 20 ) % (1U << 2 ) ; - DSI; - DST(27) = ( (*in) >> 22 ) % 
(1U << 2 ) ; - DSI; - DST(28) = ( (*in) >> 24 ) % (1U << 2 ) ; - DSI; - DST(29) = ( (*in) >> 26 ) % (1U << 2 ) ; - DSI; - DST(30) = ( (*in) >> 28 ) % (1U << 2 ) ; - DSI; - DST(31) = ( (*in) >> 30 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack3_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; - DSI; - DST( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; - DSI; - DST( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; - DSI; - DST( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; - DSI; - DST( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; - DSI; - DST( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; - DSI; - DST( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; - DSI; - DST( 7) = ( (*in) >> 21 ) % (1U << 3 ) ; - DSI; - DST( 8) = ( (*in) >> 24 ) % (1U << 3 ) ; - DSI; - DST( 9) = ( (*in) >> 27 ) % (1U << 3 ) ; - DSI; - DST(10) = ( (*in) >> 30 ) ; - ++in; - DST(10) |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); - DSI; - DST(11) = ( (*in) >> 1 ) % (1U << 3 ) ; - DSI; - DST(12) = ( (*in) >> 4 ) % (1U << 3 ) ; - DSI; - DST(13) = ( (*in) >> 7 ) % (1U << 3 ) ; - DSI; - DST(14) = ( (*in) >> 10 ) % (1U << 3 ) ; - DSI; - DST(15) = ( (*in) >> 13 ) % (1U << 3 ) ; - DSI; - DST(16) = ( (*in) >> 16 ) % (1U << 3 ) ; - DSI; - DST(17) = ( (*in) >> 19 ) % (1U << 3 ) ; - DSI; - DST(18) = ( (*in) >> 22 ) % (1U << 3 ) ; - DSI; - DST(19) = ( (*in) >> 25 ) % (1U << 3 ) ; - DSI; - DST(20) = ( (*in) >> 28 ) % (1U << 3 ) ; - DSI; - DST(21) = ( (*in) >> 31 ) ; - ++in; - DST(21) |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); - DSI; - DST(22) = ( (*in) >> 2 ) % (1U << 3 ) ; - DSI; - DST(23) = ( (*in) >> 5 ) % (1U << 3 ) ; - DSI; - DST(24) = ( (*in) >> 8 ) % (1U << 3 ) ; - DSI; - DST(25) = ( (*in) >> 11 ) % (1U << 3 ) ; - DSI; - DST(26) = ( (*in) >> 14 ) % (1U << 3 ) ; - DSI; - DST(27) = ( (*in) >> 17 ) % (1U << 3 ) ; - DSI; - DST(28) = ( (*in) >> 20 ) % (1U << 3 ) ; - DSI; - DST(29) = ( (*in) >> 23 ) % (1U << 3 ) ; - DSI; - DST(30) = ( (*in) >> 26 ) % (1U << 3 ) ; - DSI; - DST(31) = ( (*in) >> 29 ) ; - ++in; - 
DSI; - - return in; - } - - - - -const uint32_t * __fastunpack4_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; - DSI; - DST( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; - DSI; - DST( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; - DSI; - DST( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; - DSI; - DST( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; - DSI; - DST( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; - DSI; - DST( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; - DSI; - DST( 7) = ( (*in) >> 28 ) ; - ++in; - DSI; - DST( 8) = ( (*in) >> 0 ) % (1U << 4 ) ; - DSI; - DST( 9) = ( (*in) >> 4 ) % (1U << 4 ) ; - DSI; - DST(10) = ( (*in) >> 8 ) % (1U << 4 ) ; - DSI; - DST(11) = ( (*in) >> 12 ) % (1U << 4 ) ; - DSI; - DST(12) = ( (*in) >> 16 ) % (1U << 4 ) ; - DSI; - DST(13) = ( (*in) >> 20 ) % (1U << 4 ) ; - DSI; - DST(14) = ( (*in) >> 24 ) % (1U << 4 ) ; - DSI; - DST(15) = ( (*in) >> 28 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 4 ) ; - DSI; - DST(17) = ( (*in) >> 4 ) % (1U << 4 ) ; - DSI; - DST(18) = ( (*in) >> 8 ) % (1U << 4 ) ; - DSI; - DST(19) = ( (*in) >> 12 ) % (1U << 4 ) ; - DSI; - DST(20) = ( (*in) >> 16 ) % (1U << 4 ) ; - DSI; - DST(21) = ( (*in) >> 20 ) % (1U << 4 ) ; - DSI; - DST(22) = ( (*in) >> 24 ) % (1U << 4 ) ; - DSI; - DST(23) = ( (*in) >> 28 ) ; - ++in; - DSI; - DST(24) = ( (*in) >> 0 ) % (1U << 4 ) ; - DSI; - DST(25) = ( (*in) >> 4 ) % (1U << 4 ) ; - DSI; - DST(26) = ( (*in) >> 8 ) % (1U << 4 ) ; - DSI; - DST(27) = ( (*in) >> 12 ) % (1U << 4 ) ; - DSI; - DST(28) = ( (*in) >> 16 ) % (1U << 4 ) ; - DSI; - DST(29) = ( (*in) >> 20 ) % (1U << 4 ) ; - DSI; - DST(30) = ( (*in) >> 24 ) % (1U << 4 ) ; - DSI; - DST(31) = ( (*in) >> 28 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack5_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; - DSI; - DST( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; - DSI; - DST( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; - DSI; - DST( 3) = ( (*in) >> 15 ) 
% (1U << 5 ) ; - DSI; - DST( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; - DSI; - DST( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; - DSI; - DST( 6) = ( (*in) >> 30 ) ; - ++in; - DST( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); - DSI; - DST( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; - DSI; - DST( 8) = ( (*in) >> 8 ) % (1U << 5 ) ; - DSI; - DST( 9) = ( (*in) >> 13 ) % (1U << 5 ) ; - DSI; - DST(10) = ( (*in) >> 18 ) % (1U << 5 ) ; - DSI; - DST(11) = ( (*in) >> 23 ) % (1U << 5 ) ; - DSI; - DST(12) = ( (*in) >> 28 ) ; - ++in; - DST(12) |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); - DSI; - DST(13) = ( (*in) >> 1 ) % (1U << 5 ) ; - DSI; - DST(14) = ( (*in) >> 6 ) % (1U << 5 ) ; - DSI; - DST(15) = ( (*in) >> 11 ) % (1U << 5 ) ; - DSI; - DST(16) = ( (*in) >> 16 ) % (1U << 5 ) ; - DSI; - DST(17) = ( (*in) >> 21 ) % (1U << 5 ) ; - DSI; - DST(18) = ( (*in) >> 26 ) % (1U << 5 ) ; - DSI; - DST(19) = ( (*in) >> 31 ) ; - ++in; - DST(19) |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); - DSI; - DST(20) = ( (*in) >> 4 ) % (1U << 5 ) ; - DSI; - DST(21) = ( (*in) >> 9 ) % (1U << 5 ) ; - DSI; - DST(22) = ( (*in) >> 14 ) % (1U << 5 ) ; - DSI; - DST(23) = ( (*in) >> 19 ) % (1U << 5 ) ; - DSI; - DST(24) = ( (*in) >> 24 ) % (1U << 5 ) ; - DSI; - DST(25) = ( (*in) >> 29 ) ; - ++in; - DST(25) |= ((*in) % (1U<< 2 ))<<( 5 - 2 ); - DSI; - DST(26) = ( (*in) >> 2 ) % (1U << 5 ) ; - DSI; - DST(27) = ( (*in) >> 7 ) % (1U << 5 ) ; - DSI; - DST(28) = ( (*in) >> 12 ) % (1U << 5 ) ; - DSI; - DST(29) = ( (*in) >> 17 ) % (1U << 5 ) ; - DSI; - DST(30) = ( (*in) >> 22 ) % (1U << 5 ) ; - DSI; - DST(31) = ( (*in) >> 27 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack6_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; - DSI; - DST( 1) = ( (*in) >> 6 ) % (1U << 6 ) ; - DSI; - DST( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; - DSI; - DST( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; - DSI; - DST( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; - DSI; - DST( 5) = ( (*in) >> 30 ) ; - ++in; - DST( 5) |= ((*in) 
% (1U<< 4 ))<<( 6 - 4 ); - DSI; - DST( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; - DSI; - DST( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; - DSI; - DST( 8) = ( (*in) >> 16 ) % (1U << 6 ) ; - DSI; - DST( 9) = ( (*in) >> 22 ) % (1U << 6 ) ; - DSI; - DST(10) = ( (*in) >> 28 ) ; - ++in; - DST(10) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); - DSI; - DST(11) = ( (*in) >> 2 ) % (1U << 6 ) ; - DSI; - DST(12) = ( (*in) >> 8 ) % (1U << 6 ) ; - DSI; - DST(13) = ( (*in) >> 14 ) % (1U << 6 ) ; - DSI; - DST(14) = ( (*in) >> 20 ) % (1U << 6 ) ; - DSI; - DST(15) = ( (*in) >> 26 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 6 ) ; - DSI; - DST(17) = ( (*in) >> 6 ) % (1U << 6 ) ; - DSI; - DST(18) = ( (*in) >> 12 ) % (1U << 6 ) ; - DSI; - DST(19) = ( (*in) >> 18 ) % (1U << 6 ) ; - DSI; - DST(20) = ( (*in) >> 24 ) % (1U << 6 ) ; - DSI; - DST(21) = ( (*in) >> 30 ) ; - ++in; - DST(21) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); - DSI; - DST(22) = ( (*in) >> 4 ) % (1U << 6 ) ; - DSI; - DST(23) = ( (*in) >> 10 ) % (1U << 6 ) ; - DSI; - DST(24) = ( (*in) >> 16 ) % (1U << 6 ) ; - DSI; - DST(25) = ( (*in) >> 22 ) % (1U << 6 ) ; - DSI; - DST(26) = ( (*in) >> 28 ) ; - ++in; - DST(26) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); - DSI; - DST(27) = ( (*in) >> 2 ) % (1U << 6 ) ; - DSI; - DST(28) = ( (*in) >> 8 ) % (1U << 6 ) ; - DSI; - DST(29) = ( (*in) >> 14 ) % (1U << 6 ) ; - DSI; - DST(30) = ( (*in) >> 20 ) % (1U << 6 ) ; - DSI; - DST(31) = ( (*in) >> 26 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack7_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; - DSI; - DST( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; - DSI; - DST( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; - DSI; - DST( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; - DSI; - DST( 4) = ( (*in) >> 28 ) ; - ++in; - DST( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); - DSI; - DST( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; - DSI; - DST( 6) = ( (*in) >> 10 ) % (1U << 7 ) ; - DSI; - DST( 7) = ( (*in) >> 17 ) % (1U << 7 ) ; - DSI; - 
DST( 8) = ( (*in) >> 24 ) % (1U << 7 ) ; - DSI; - DST( 9) = ( (*in) >> 31 ) ; - ++in; - DST( 9) |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); - DSI; - DST(10) = ( (*in) >> 6 ) % (1U << 7 ) ; - DSI; - DST(11) = ( (*in) >> 13 ) % (1U << 7 ) ; - DSI; - DST(12) = ( (*in) >> 20 ) % (1U << 7 ) ; - DSI; - DST(13) = ( (*in) >> 27 ) ; - ++in; - DST(13) |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); - DSI; - DST(14) = ( (*in) >> 2 ) % (1U << 7 ) ; - DSI; - DST(15) = ( (*in) >> 9 ) % (1U << 7 ) ; - DSI; - DST(16) = ( (*in) >> 16 ) % (1U << 7 ) ; - DSI; - DST(17) = ( (*in) >> 23 ) % (1U << 7 ) ; - DSI; - DST(18) = ( (*in) >> 30 ) ; - ++in; - DST(18) |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); - DSI; - DST(19) = ( (*in) >> 5 ) % (1U << 7 ) ; - DSI; - DST(20) = ( (*in) >> 12 ) % (1U << 7 ) ; - DSI; - DST(21) = ( (*in) >> 19 ) % (1U << 7 ) ; - DSI; - DST(22) = ( (*in) >> 26 ) ; - ++in; - DST(22) |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); - DSI; - DST(23) = ( (*in) >> 1 ) % (1U << 7 ) ; - DSI; - DST(24) = ( (*in) >> 8 ) % (1U << 7 ) ; - DSI; - DST(25) = ( (*in) >> 15 ) % (1U << 7 ) ; - DSI; - DST(26) = ( (*in) >> 22 ) % (1U << 7 ) ; - DSI; - DST(27) = ( (*in) >> 29 ) ; - ++in; - DST(27) |= ((*in) % (1U<< 4 ))<<( 7 - 4 ); - DSI; - DST(28) = ( (*in) >> 4 ) % (1U << 7 ) ; - DSI; - DST(29) = ( (*in) >> 11 ) % (1U << 7 ) ; - DSI; - DST(30) = ( (*in) >> 18 ) % (1U << 7 ) ; - DSI; - DST(31) = ( (*in) >> 25 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack8_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - DST( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST( 3) = ( (*in) >> 24 ) ; - ++in; - DSI; - DST( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - DST( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST( 7) = ( (*in) >> 24 ) ; - ++in; - DSI; - DST( 8) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST( 9) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - 
DST(10) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST(11) = ( (*in) >> 24 ) ; - ++in; - DSI; - DST(12) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST(13) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - DST(14) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST(15) = ( (*in) >> 24 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST(17) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - DST(18) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST(19) = ( (*in) >> 24 ) ; - ++in; - DSI; - DST(20) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST(21) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - DST(22) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST(23) = ( (*in) >> 24 ) ; - ++in; - DSI; - DST(24) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST(25) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - DST(26) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST(27) = ( (*in) >> 24 ) ; - ++in; - DSI; - DST(28) = ( (*in) >> 0 ) % (1U << 8 ) ; - DSI; - DST(29) = ( (*in) >> 8 ) % (1U << 8 ) ; - DSI; - DST(30) = ( (*in) >> 16 ) % (1U << 8 ) ; - DSI; - DST(31) = ( (*in) >> 24 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack9_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; - DSI; - DST( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; - DSI; - DST( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; - DSI; - DST( 3) = ( (*in) >> 27 ) ; - ++in; - DST( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); - DSI; - DST( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; - DSI; - DST( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; - DSI; - DST( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; - DSI; - DST( 7) = ( (*in) >> 31 ) ; - ++in; - DST( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); - DSI; - DST( 8) = ( (*in) >> 8 ) % (1U << 9 ) ; - DSI; - DST( 9) = ( (*in) >> 17 ) % (1U << 9 ) ; - DSI; - DST(10) = ( (*in) >> 26 ) ; - ++in; - DST(10) |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); - DSI; - DST(11) = ( (*in) >> 3 ) % (1U << 9 ) ; - DSI; - DST(12) = ( (*in) >> 12 ) % (1U << 9 ) ; - DSI; - DST(13) = ( (*in) >> 21 ) % (1U << 9 ) ; - DSI; - 
DST(14) = ( (*in) >> 30 ) ; - ++in; - DST(14) |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); - DSI; - DST(15) = ( (*in) >> 7 ) % (1U << 9 ) ; - DSI; - DST(16) = ( (*in) >> 16 ) % (1U << 9 ) ; - DSI; - DST(17) = ( (*in) >> 25 ) ; - ++in; - DST(17) |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); - DSI; - DST(18) = ( (*in) >> 2 ) % (1U << 9 ) ; - DSI; - DST(19) = ( (*in) >> 11 ) % (1U << 9 ) ; - DSI; - DST(20) = ( (*in) >> 20 ) % (1U << 9 ) ; - DSI; - DST(21) = ( (*in) >> 29 ) ; - ++in; - DST(21) |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); - DSI; - DST(22) = ( (*in) >> 6 ) % (1U << 9 ) ; - DSI; - DST(23) = ( (*in) >> 15 ) % (1U << 9 ) ; - DSI; - DST(24) = ( (*in) >> 24 ) ; - ++in; - DST(24) |= ((*in) % (1U<< 1 ))<<( 9 - 1 ); - DSI; - DST(25) = ( (*in) >> 1 ) % (1U << 9 ) ; - DSI; - DST(26) = ( (*in) >> 10 ) % (1U << 9 ) ; - DSI; - DST(27) = ( (*in) >> 19 ) % (1U << 9 ) ; - DSI; - DST(28) = ( (*in) >> 28 ) ; - ++in; - DST(28) |= ((*in) % (1U<< 5 ))<<( 9 - 5 ); - DSI; - DST(29) = ( (*in) >> 5 ) % (1U << 9 ) ; - DSI; - DST(30) = ( (*in) >> 14 ) % (1U << 9 ) ; - DSI; - DST(31) = ( (*in) >> 23 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack10_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; - DSI; - DST( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; - DSI; - DST( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; - DSI; - DST( 3) = ( (*in) >> 30 ) ; - ++in; - DST( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - DSI; - DST( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; - DSI; - DST( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; - DSI; - DST( 6) = ( (*in) >> 28 ) ; - ++in; - DST( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - DSI; - DST( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; - DSI; - DST( 8) = ( (*in) >> 16 ) % (1U << 10 ) ; - DSI; - DST( 9) = ( (*in) >> 26 ) ; - ++in; - DST( 9) |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); - DSI; - DST(10) = ( (*in) >> 4 ) % (1U << 10 ) ; - DSI; - DST(11) = ( (*in) >> 14 ) % (1U << 10 ) ; - DSI; - DST(12) = ( (*in) >> 24 ) ; - ++in; - DST(12) |= ((*in) 
% (1U<< 2 ))<<( 10 - 2 ); - DSI; - DST(13) = ( (*in) >> 2 ) % (1U << 10 ) ; - DSI; - DST(14) = ( (*in) >> 12 ) % (1U << 10 ) ; - DSI; - DST(15) = ( (*in) >> 22 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 10 ) ; - DSI; - DST(17) = ( (*in) >> 10 ) % (1U << 10 ) ; - DSI; - DST(18) = ( (*in) >> 20 ) % (1U << 10 ) ; - DSI; - DST(19) = ( (*in) >> 30 ) ; - ++in; - DST(19) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); - DSI; - DST(20) = ( (*in) >> 8 ) % (1U << 10 ) ; - DSI; - DST(21) = ( (*in) >> 18 ) % (1U << 10 ) ; - DSI; - DST(22) = ( (*in) >> 28 ) ; - ++in; - DST(22) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); - DSI; - DST(23) = ( (*in) >> 6 ) % (1U << 10 ) ; - DSI; - DST(24) = ( (*in) >> 16 ) % (1U << 10 ) ; - DSI; - DST(25) = ( (*in) >> 26 ) ; - ++in; - DST(25) |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); - DSI; - DST(26) = ( (*in) >> 4 ) % (1U << 10 ) ; - DSI; - DST(27) = ( (*in) >> 14 ) % (1U << 10 ) ; - DSI; - DST(28) = ( (*in) >> 24 ) ; - ++in; - DST(28) |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); - DSI; - DST(29) = ( (*in) >> 2 ) % (1U << 10 ) ; - DSI; - DST(30) = ( (*in) >> 12 ) % (1U << 10 ) ; - DSI; - DST(31) = ( (*in) >> 22 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack11_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 11 ) ; - DSI; - DST( 1) = ( (*in) >> 11 ) % (1U << 11 ) ; - DSI; - DST( 2) = ( (*in) >> 22 ) ; - ++in; - DST( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); - DSI; - DST( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; - DSI; - DST( 4) = ( (*in) >> 12 ) % (1U << 11 ) ; - DSI; - DST( 5) = ( (*in) >> 23 ) ; - ++in; - DST( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); - DSI; - DST( 6) = ( (*in) >> 2 ) % (1U << 11 ) ; - DSI; - DST( 7) = ( (*in) >> 13 ) % (1U << 11 ) ; - DSI; - DST( 8) = ( (*in) >> 24 ) ; - ++in; - DST( 8) |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); - DSI; - DST( 9) = ( (*in) >> 3 ) % (1U << 11 ) ; - DSI; - DST(10) = ( (*in) >> 14 ) % (1U << 11 ) ; - DSI; - DST(11) = ( (*in) >> 25 ) ; - ++in; - DST(11) |= 
((*in) % (1U<< 4 ))<<( 11 - 4 ); - DSI; - DST(12) = ( (*in) >> 4 ) % (1U << 11 ) ; - DSI; - DST(13) = ( (*in) >> 15 ) % (1U << 11 ) ; - DSI; - DST(14) = ( (*in) >> 26 ) ; - ++in; - DST(14) |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); - DSI; - DST(15) = ( (*in) >> 5 ) % (1U << 11 ) ; - DSI; - DST(16) = ( (*in) >> 16 ) % (1U << 11 ) ; - DSI; - DST(17) = ( (*in) >> 27 ) ; - ++in; - DST(17) |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); - DSI; - DST(18) = ( (*in) >> 6 ) % (1U << 11 ) ; - DSI; - DST(19) = ( (*in) >> 17 ) % (1U << 11 ) ; - DSI; - DST(20) = ( (*in) >> 28 ) ; - ++in; - DST(20) |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); - DSI; - DST(21) = ( (*in) >> 7 ) % (1U << 11 ) ; - DSI; - DST(22) = ( (*in) >> 18 ) % (1U << 11 ) ; - DSI; - DST(23) = ( (*in) >> 29 ) ; - ++in; - DST(23) |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); - DSI; - DST(24) = ( (*in) >> 8 ) % (1U << 11 ) ; - DSI; - DST(25) = ( (*in) >> 19 ) % (1U << 11 ) ; - DSI; - DST(26) = ( (*in) >> 30 ) ; - ++in; - DST(26) |= ((*in) % (1U<< 9 ))<<( 11 - 9 ); - DSI; - DST(27) = ( (*in) >> 9 ) % (1U << 11 ) ; - DSI; - DST(28) = ( (*in) >> 20 ) % (1U << 11 ) ; - DSI; - DST(29) = ( (*in) >> 31 ) ; - ++in; - DST(29) |= ((*in) % (1U<< 10 ))<<( 11 - 10 ); - DSI; - DST(30) = ( (*in) >> 10 ) % (1U << 11 ) ; - DSI; - DST(31) = ( (*in) >> 21 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack12_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; - DSI; - DST( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; - DSI; - DST( 2) = ( (*in) >> 24 ) ; - ++in; - DST( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - DSI; - DST( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; - DSI; - DST( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; - DSI; - DST( 5) = ( (*in) >> 28 ) ; - ++in; - DST( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - DSI; - DST( 6) = ( (*in) >> 8 ) % (1U << 12 ) ; - DSI; - DST( 7) = ( (*in) >> 20 ) ; - ++in; - DSI; - DST( 8) = ( (*in) >> 0 ) % (1U << 12 ) ; - DSI; - DST( 9) = ( (*in) >> 12 ) % (1U << 12 ) ; - DSI; - 
DST(10) = ( (*in) >> 24 ) ; - ++in; - DST(10) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - DSI; - DST(11) = ( (*in) >> 4 ) % (1U << 12 ) ; - DSI; - DST(12) = ( (*in) >> 16 ) % (1U << 12 ) ; - DSI; - DST(13) = ( (*in) >> 28 ) ; - ++in; - DST(13) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - DSI; - DST(14) = ( (*in) >> 8 ) % (1U << 12 ) ; - DSI; - DST(15) = ( (*in) >> 20 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 12 ) ; - DSI; - DST(17) = ( (*in) >> 12 ) % (1U << 12 ) ; - DSI; - DST(18) = ( (*in) >> 24 ) ; - ++in; - DST(18) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - DSI; - DST(19) = ( (*in) >> 4 ) % (1U << 12 ) ; - DSI; - DST(20) = ( (*in) >> 16 ) % (1U << 12 ) ; - DSI; - DST(21) = ( (*in) >> 28 ) ; - ++in; - DST(21) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - DSI; - DST(22) = ( (*in) >> 8 ) % (1U << 12 ) ; - DSI; - DST(23) = ( (*in) >> 20 ) ; - ++in; - DSI; - DST(24) = ( (*in) >> 0 ) % (1U << 12 ) ; - DSI; - DST(25) = ( (*in) >> 12 ) % (1U << 12 ) ; - DSI; - DST(26) = ( (*in) >> 24 ) ; - ++in; - DST(26) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); - DSI; - DST(27) = ( (*in) >> 4 ) % (1U << 12 ) ; - DSI; - DST(28) = ( (*in) >> 16 ) % (1U << 12 ) ; - DSI; - DST(29) = ( (*in) >> 28 ) ; - ++in; - DST(29) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); - DSI; - DST(30) = ( (*in) >> 8 ) % (1U << 12 ) ; - DSI; - DST(31) = ( (*in) >> 20 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack13_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; - DSI; - DST( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; - DSI; - DST( 2) = ( (*in) >> 26 ) ; - ++in; - DST( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); - DSI; - DST( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; - DSI; - DST( 4) = ( (*in) >> 20 ) ; - ++in; - DST( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); - DSI; - DST( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; - DSI; - DST( 6) = ( (*in) >> 14 ) % (1U << 13 ) ; - DSI; - DST( 7) = ( (*in) >> 27 ) ; - ++in; - DST( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); - DSI; - DST( 8) = ( 
(*in) >> 8 ) % (1U << 13 ) ; - DSI; - DST( 9) = ( (*in) >> 21 ) ; - ++in; - DST( 9) |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); - DSI; - DST(10) = ( (*in) >> 2 ) % (1U << 13 ) ; - DSI; - DST(11) = ( (*in) >> 15 ) % (1U << 13 ) ; - DSI; - DST(12) = ( (*in) >> 28 ) ; - ++in; - DST(12) |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); - DSI; - DST(13) = ( (*in) >> 9 ) % (1U << 13 ) ; - DSI; - DST(14) = ( (*in) >> 22 ) ; - ++in; - DST(14) |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); - DSI; - DST(15) = ( (*in) >> 3 ) % (1U << 13 ) ; - DSI; - DST(16) = ( (*in) >> 16 ) % (1U << 13 ) ; - DSI; - DST(17) = ( (*in) >> 29 ) ; - ++in; - DST(17) |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); - DSI; - DST(18) = ( (*in) >> 10 ) % (1U << 13 ) ; - DSI; - DST(19) = ( (*in) >> 23 ) ; - ++in; - DST(19) |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); - DSI; - DST(20) = ( (*in) >> 4 ) % (1U << 13 ) ; - DSI; - DST(21) = ( (*in) >> 17 ) % (1U << 13 ) ; - DSI; - DST(22) = ( (*in) >> 30 ) ; - ++in; - DST(22) |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); - DSI; - DST(23) = ( (*in) >> 11 ) % (1U << 13 ) ; - DSI; - DST(24) = ( (*in) >> 24 ) ; - ++in; - DST(24) |= ((*in) % (1U<< 5 ))<<( 13 - 5 ); - DSI; - DST(25) = ( (*in) >> 5 ) % (1U << 13 ) ; - DSI; - DST(26) = ( (*in) >> 18 ) % (1U << 13 ) ; - DSI; - DST(27) = ( (*in) >> 31 ) ; - ++in; - DST(27) |= ((*in) % (1U<< 12 ))<<( 13 - 12 ); - DSI; - DST(28) = ( (*in) >> 12 ) % (1U << 13 ) ; - DSI; - DST(29) = ( (*in) >> 25 ) ; - ++in; - DST(29) |= ((*in) % (1U<< 6 ))<<( 13 - 6 ); - DSI; - DST(30) = ( (*in) >> 6 ) % (1U << 13 ) ; - DSI; - DST(31) = ( (*in) >> 19 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack14_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 14 ) ; - DSI; - DST( 1) = ( (*in) >> 14 ) % (1U << 14 ) ; - DSI; - DST( 2) = ( (*in) >> 28 ) ; - ++in; - DST( 2) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - DSI; - DST( 3) = ( (*in) >> 10 ) % (1U << 14 ) ; - DSI; - DST( 4) = ( (*in) >> 24 ) ; - ++in; - DST( 4) |= ((*in) % (1U<< 
6 ))<<( 14 - 6 ); - DSI; - DST( 5) = ( (*in) >> 6 ) % (1U << 14 ) ; - DSI; - DST( 6) = ( (*in) >> 20 ) ; - ++in; - DST( 6) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); - DSI; - DST( 7) = ( (*in) >> 2 ) % (1U << 14 ) ; - DSI; - DST( 8) = ( (*in) >> 16 ) % (1U << 14 ) ; - DSI; - DST( 9) = ( (*in) >> 30 ) ; - ++in; - DST( 9) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); - DSI; - DST(10) = ( (*in) >> 12 ) % (1U << 14 ) ; - DSI; - DST(11) = ( (*in) >> 26 ) ; - ++in; - DST(11) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); - DSI; - DST(12) = ( (*in) >> 8 ) % (1U << 14 ) ; - DSI; - DST(13) = ( (*in) >> 22 ) ; - ++in; - DST(13) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); - DSI; - DST(14) = ( (*in) >> 4 ) % (1U << 14 ) ; - DSI; - DST(15) = ( (*in) >> 18 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 14 ) ; - DSI; - DST(17) = ( (*in) >> 14 ) % (1U << 14 ) ; - DSI; - DST(18) = ( (*in) >> 28 ) ; - ++in; - DST(18) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); - DSI; - DST(19) = ( (*in) >> 10 ) % (1U << 14 ) ; - DSI; - DST(20) = ( (*in) >> 24 ) ; - ++in; - DST(20) |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); - DSI; - DST(21) = ( (*in) >> 6 ) % (1U << 14 ) ; - DSI; - DST(22) = ( (*in) >> 20 ) ; - ++in; - DST(22) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); - DSI; - DST(23) = ( (*in) >> 2 ) % (1U << 14 ) ; - DSI; - DST(24) = ( (*in) >> 16 ) % (1U << 14 ) ; - DSI; - DST(25) = ( (*in) >> 30 ) ; - ++in; - DST(25) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); - DSI; - DST(26) = ( (*in) >> 12 ) % (1U << 14 ) ; - DSI; - DST(27) = ( (*in) >> 26 ) ; - ++in; - DST(27) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); - DSI; - DST(28) = ( (*in) >> 8 ) % (1U << 14 ) ; - DSI; - DST(29) = ( (*in) >> 22 ) ; - ++in; - DST(29) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); - DSI; - DST(30) = ( (*in) >> 4 ) % (1U << 14 ) ; - DSI; - DST(31) = ( (*in) >> 18 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack15_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 15 ) ; - DSI; - DST( 1) = ( (*in) >> 15 ) % (1U << 
15 ) ; - DSI; - DST( 2) = ( (*in) >> 30 ) ; - ++in; - DST( 2) |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); - DSI; - DST( 3) = ( (*in) >> 13 ) % (1U << 15 ) ; - DSI; - DST( 4) = ( (*in) >> 28 ) ; - ++in; - DST( 4) |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); - DSI; - DST( 5) = ( (*in) >> 11 ) % (1U << 15 ) ; - DSI; - DST( 6) = ( (*in) >> 26 ) ; - ++in; - DST( 6) |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); - DSI; - DST( 7) = ( (*in) >> 9 ) % (1U << 15 ) ; - DSI; - DST( 8) = ( (*in) >> 24 ) ; - ++in; - DST( 8) |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); - DSI; - DST( 9) = ( (*in) >> 7 ) % (1U << 15 ) ; - DSI; - DST(10) = ( (*in) >> 22 ) ; - ++in; - DST(10) |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); - DSI; - DST(11) = ( (*in) >> 5 ) % (1U << 15 ) ; - DSI; - DST(12) = ( (*in) >> 20 ) ; - ++in; - DST(12) |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); - DSI; - DST(13) = ( (*in) >> 3 ) % (1U << 15 ) ; - DSI; - DST(14) = ( (*in) >> 18 ) ; - ++in; - DST(14) |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); - DSI; - DST(15) = ( (*in) >> 1 ) % (1U << 15 ) ; - DSI; - DST(16) = ( (*in) >> 16 ) % (1U << 15 ) ; - DSI; - DST(17) = ( (*in) >> 31 ) ; - ++in; - DST(17) |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); - DSI; - DST(18) = ( (*in) >> 14 ) % (1U << 15 ) ; - DSI; - DST(19) = ( (*in) >> 29 ) ; - ++in; - DST(19) |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); - DSI; - DST(20) = ( (*in) >> 12 ) % (1U << 15 ) ; - DSI; - DST(21) = ( (*in) >> 27 ) ; - ++in; - DST(21) |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); - DSI; - DST(22) = ( (*in) >> 10 ) % (1U << 15 ) ; - DSI; - DST(23) = ( (*in) >> 25 ) ; - ++in; - DST(23) |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); - DSI; - DST(24) = ( (*in) >> 8 ) % (1U << 15 ) ; - DSI; - DST(25) = ( (*in) >> 23 ) ; - ++in; - DST(25) |= ((*in) % (1U<< 6 ))<<( 15 - 6 ); - DSI; - DST(26) = ( (*in) >> 6 ) % (1U << 15 ) ; - DSI; - DST(27) = ( (*in) >> 21 ) ; - ++in; - DST(27) |= ((*in) % (1U<< 4 ))<<( 15 - 4 ); - DSI; - DST(28) = ( (*in) >> 4 ) % (1U << 15 ) ; - DSI; - DST(29) = ( (*in) >> 19 ) ; - ++in; - DST(29) |= ((*in) % (1U<< 2 ))<<( 15 - 2 ); - 
DSI; - DST(30) = ( (*in) >> 2 ) % (1U << 15 ) ; - DSI; - DST(31) = ( (*in) >> 17 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack16_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST( 1) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST( 2) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST( 3) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST( 4) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST( 5) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST( 6) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST( 7) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST( 8) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST( 9) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(10) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(11) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(12) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(13) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(14) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(15) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(16) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(17) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(18) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(19) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(20) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(21) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(22) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(23) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(24) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(25) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(26) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(27) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(28) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(29) = ( (*in) >> 16 ) ; - ++in; - DSI; - DST(30) = ( (*in) >> 0 ) % (1U << 16 ) ; - DSI; - DST(31) = ( (*in) >> 16 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack17_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - DST( 0) = ( (*in) >> 0 ) % (1U << 17 ) ; - DSI; - DST( 1) = ( (*in) >> 17 ) ; - ++in; - DST( 1) |= ((*in) % (1U<< 2 ))<<( 17 
- 2 ); - DSI; - DST( 2) = ( (*in) >> 2 ) % (1U << 17 ) ; - DSI; - DST( 3) = ( (*in) >> 19 ) ; - ++in; - DST( 3) |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); - DSI; - DST( 4) = ( (*in) >> 4 ) % (1U << 17 ) ; - DSI; - DST( 5) = ( (*in) >> 21 ) ; - ++in; - DST( 5) |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); - DSI; - DST( 6) = ( (*in) >> 6 ) % (1U << 17 ) ; - DSI; - DST( 7) = ( (*in) >> 23 ) ; - ++in; - DST( 7) |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); - DSI; - DST( 8) = ( (*in) >> 8 ) % (1U << 17 ) ; - DSI; - DST( 9) = ( (*in) >> 25 ) ; - ++in; - DST( 9) |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); - DSI; - DST(10) = ( (*in) >> 10 ) % (1U << 17 ) ; - DSI; - DST(11) = ( (*in) >> 27 ) ; - ++in; - DST(11) |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); - DSI; - DST(12) = ( (*in) >> 12 ) % (1U << 17 ) ; - DSI; - DST(13) = ( (*in) >> 29 ) ; - ++in; - DST(13) |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); - DSI; - DST(14) = ( (*in) >> 14 ) % (1U << 17 ) ; - DSI; - DST(15) = ( (*in) >> 31 ) ; - ++in; - DST(15) |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); - DSI; - DST(16) = ( (*in) >> 16 ) ; - ++in; - DST(16) |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); - DSI; - DST(17) = ( (*in) >> 1 ) % (1U << 17 ) ; - DSI; - DST(18) = ( (*in) >> 18 ) ; - ++in; - DST(18) |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); - DSI; - DST(19) = ( (*in) >> 3 ) % (1U << 17 ) ; - DSI; - DST(20) = ( (*in) >> 20 ) ; - ++in; - DST(20) |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); - DSI; - DST(21) = ( (*in) >> 5 ) % (1U << 17 ) ; - DSI; - DST(22) = ( (*in) >> 22 ) ; - ++in; - DST(22) |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); - DSI; - DST(23) = ( (*in) >> 7 ) % (1U << 17 ) ; - DSI; - DST(24) = ( (*in) >> 24 ) ; - ++in; - DST(24) |= ((*in) % (1U<< 9 ))<<( 17 - 9 ); - DSI; - DST(25) = ( (*in) >> 9 ) % (1U << 17 ) ; - DSI; - DST(26) = ( (*in) >> 26 ) ; - ++in; - DST(26) |= ((*in) % (1U<< 11 ))<<( 17 - 11 ); - DSI; - DST(27) = ( (*in) >> 11 ) % (1U << 17 ) ; - DSI; - DST(28) = ( (*in) >> 28 ) ; - ++in; - DST(28) |= ((*in) % (1U<< 13 ))<<( 17 - 13 ); - DSI; - DST(29) = ( (*in) >> 13 ) % (1U << 17 ) ; - 
DSI; - DST(30) = ( (*in) >> 30 ) ; - ++in; - DST(30) |= ((*in) % (1U<< 15 ))<<( 17 - 15 ); - DSI; - DST(31) = ( (*in) >> 15 ) ; - ++in; - DSI; - - return in; - } - - - - -const uint32_t * __fastunpack18_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); - out++; - 
*out = ( (*in) >> 2 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 18 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack19_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); - out++; - *out = ( (*in) >> 12 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); - out++; - *out = ( (*in) >> 11 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 19 ) ; - out++; - 
*out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 19 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 19 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 19 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 19 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 19 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack20_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 
))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 20 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack21_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); - out++; - *out = ( (*in) >> 10 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); - out++; - *out = ( (*in) >> 9 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); 
- out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 21 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 21 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 21 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 21 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 21 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 21 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack22_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - out++; - *out = ( (*in) >> 
2 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 22 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; 
- *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack23_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); - out++; - *out = ( (*in) >> 7 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); - out++; - *out = ( (*in) >> 
17 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); - out++; - *out = ( (*in) >> 8 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 23 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 23 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 23 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 23 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 23 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 23 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack24_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( 
(*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 24 ) ; - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack25_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); - out++; - *out = 
( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); - out++; - *out = ( (*in) >> 5 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); - out++; - *out = ( (*in) >> 6 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 25 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 25 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 25 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 25 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 25 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 25 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 25 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack26_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - out++; - *out 
= ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 26 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out 
|= ((*in) % (1U<< 12 ))<<( 26 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack27_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); - out++; - *out = ( (*in) >> 4 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); - out++; - 
*out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 27 - 3 ); - out++; - *out = ( (*in) >> 3 ) % (1U << 27 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 27 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 27 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 27 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 27 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 27 - 5 ); - out++; - *out = ( (*in) >> 5 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack28_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) 
% (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 28 ) ; - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack29_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 17 
))<<( 29 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); - out++; - *out = ( (*in) >> 5 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); - out++; - *out = ( (*in) >> 2 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); - out++; - *out = ( (*in) >> 1 ) % (1U << 29 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 29 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 29 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 29 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 29 - 12 ); - out++; - *out = ( (*in) >> 12 
) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 29 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 29 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 29 - 3 ); - out++; - *out = ( (*in) >> 3 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack30_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - out++; - *out = ( (*in) >> 2 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) % (1U << 30 ) ; - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - 
*out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); - out++; - *out = ( (*in) >> 2 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack31_32(const uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) % (1U << 31 ) ; - out++; - *out = ( (*in) >> 31 ) ; - ++in; - *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); - out++; - *out = ( (*in) >> 30 ) ; - ++in; - *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); - out++; - *out = ( (*in) >> 29 ) ; - ++in; - *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); - out++; - *out = ( (*in) >> 28 ) ; - ++in; - *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); - out++; - *out = ( (*in) >> 27 ) ; - ++in; - *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); - out++; - *out = ( (*in) >> 26 ) ; - ++in; - *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); - out++; - *out = ( (*in) >> 25 ) ; - ++in; - *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); - out++; - *out = ( (*in) >> 24 ) ; - ++in; - *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); - out++; - *out = ( (*in) >> 23 ) ; - ++in; - *out |= 
((*in) % (1U<< 22 ))<<( 31 - 22 ); - out++; - *out = ( (*in) >> 22 ) ; - ++in; - *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); - out++; - *out = ( (*in) >> 21 ) ; - ++in; - *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); - out++; - *out = ( (*in) >> 20 ) ; - ++in; - *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); - out++; - *out = ( (*in) >> 19 ) ; - ++in; - *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); - out++; - *out = ( (*in) >> 18 ) ; - ++in; - *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); - out++; - *out = ( (*in) >> 17 ) ; - ++in; - *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); - out++; - *out = ( (*in) >> 16 ) ; - ++in; - *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); - out++; - *out = ( (*in) >> 15 ) ; - ++in; - *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); - out++; - *out = ( (*in) >> 14 ) ; - ++in; - *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); - out++; - *out = ( (*in) >> 13 ) ; - ++in; - *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); - out++; - *out = ( (*in) >> 12 ) ; - ++in; - *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); - out++; - *out = ( (*in) >> 11 ) ; - ++in; - *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); - out++; - *out = ( (*in) >> 10 ) ; - ++in; - *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); - out++; - *out = ( (*in) >> 9 ) ; - ++in; - *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); - out++; - *out = ( (*in) >> 8 ) ; - ++in; - *out |= ((*in) % (1U<< 7 ))<<( 31 - 7 ); - out++; - *out = ( (*in) >> 7 ) ; - ++in; - *out |= ((*in) % (1U<< 6 ))<<( 31 - 6 ); - out++; - *out = ( (*in) >> 6 ) ; - ++in; - *out |= ((*in) % (1U<< 5 ))<<( 31 - 5 ); - out++; - *out = ( (*in) >> 5 ) ; - ++in; - *out |= ((*in) % (1U<< 4 ))<<( 31 - 4 ); - out++; - *out = ( (*in) >> 4 ) ; - ++in; - *out |= ((*in) % (1U<< 3 ))<<( 31 - 3 ); - out++; - *out = ( (*in) >> 3 ) ; - ++in; - *out |= ((*in) % (1U<< 2 ))<<( 31 - 2 ); - out++; - *out = ( (*in) >> 2 ) ; - ++in; - *out |= ((*in) % (1U<< 1 ))<<( 31 - 1 ); - out++; - *out = ( (*in) >> 1 ) ; - ++in; - out++; - - return in; - } - - - - -const uint32_t * __fastunpack32_32(const 
uint32_t * __restrict in, uint32_t * __restrict out) { - - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - *out = ( (*in) >> 0 ) ; - ++in; - out++; - - return in; - } - - - - const uint32_t * fastunpack_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullunpacker32(in,out); - - case 1: - return __fastunpack1_32(in,out); - - case 2: - return __fastunpack2_32(in,out); - - case 3: - return __fastunpack3_32(in,out); - - case 4: - return __fastunpack4_32(in,out); - - case 5: - return __fastunpack5_32(in,out); - - case 6: - return __fastunpack6_32(in,out); - - case 7: - return __fastunpack7_32(in,out); - - case 8: - return 
__fastunpack8_32(in,out); - - case 9: - return __fastunpack9_32(in,out); - - case 10: - return __fastunpack10_32(in,out); - - case 11: - return __fastunpack11_32(in,out); - - case 12: - return __fastunpack12_32(in,out); - - case 13: - return __fastunpack13_32(in,out); - - case 14: - return __fastunpack14_32(in,out); - - case 15: - return __fastunpack15_32(in,out); - - case 16: - return __fastunpack16_32(in,out); - - case 17: - return __fastunpack17_32(in,out); - - case 18: - return __fastunpack18_32(in,out); - - case 19: - return __fastunpack19_32(in,out); - - case 20: - return __fastunpack20_32(in,out); - - case 21: - return __fastunpack21_32(in,out); - - case 22: - return __fastunpack22_32(in,out); - - case 23: - return __fastunpack23_32(in,out); - - case 24: - return __fastunpack24_32(in,out); - - case 25: - return __fastunpack25_32(in,out); - - case 26: - return __fastunpack26_32(in,out); - - case 27: - return __fastunpack27_32(in,out); - - case 28: - return __fastunpack28_32(in,out); - - case 29: - return __fastunpack29_32(in,out); - - case 30: - return __fastunpack30_32(in,out); - - case 31: - return __fastunpack31_32(in,out); - - case 32: - return __fastunpack32_32(in,out); - - default: - break; - } - //throw logic_error("number of bits is unsupported"); - } - - - - /*assumes that integers fit in the prescribed number of bits*/ - uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { - switch(bit) { - case 0: return nullpacker(in,out); - - case 1: - return __fastpackwithoutmask1_32(in,out); - - case 2: - return __fastpackwithoutmask2_32(in,out); - - case 3: - return __fastpackwithoutmask3_32(in,out); - - case 4: - return __fastpackwithoutmask4_32(in,out); - - case 5: - return __fastpackwithoutmask5_32(in,out); - - case 6: - return __fastpackwithoutmask6_32(in,out); - - case 7: - return __fastpackwithoutmask7_32(in,out); - - case 8: - return __fastpackwithoutmask8_32(in,out); - - case 9: - return 
__fastpackwithoutmask9_32(in,out); - - case 10: - return __fastpackwithoutmask10_32(in,out); - - case 11: - return __fastpackwithoutmask11_32(in,out); - - case 12: - return __fastpackwithoutmask12_32(in,out); - - case 13: - return __fastpackwithoutmask13_32(in,out); - - case 14: - return __fastpackwithoutmask14_32(in,out); - - case 15: - return __fastpackwithoutmask15_32(in,out); - - case 16: - return __fastpackwithoutmask16_32(in,out); - - case 17: - return __fastpackwithoutmask17_32(in,out); - - case 18: - return __fastpackwithoutmask18_32(in,out); - - case 19: - return __fastpackwithoutmask19_32(in,out); - - case 20: - return __fastpackwithoutmask20_32(in,out); - - case 21: - return __fastpackwithoutmask21_32(in,out); - - case 22: - return __fastpackwithoutmask22_32(in,out); - - case 23: - return __fastpackwithoutmask23_32(in,out); - - case 24: - return __fastpackwithoutmask24_32(in,out); - - case 25: - return __fastpackwithoutmask25_32(in,out); - - case 26: - return __fastpackwithoutmask26_32(in,out); - - case 27: - return __fastpackwithoutmask27_32(in,out); - - case 28: - return __fastpackwithoutmask28_32(in,out); - - case 29: - return __fastpackwithoutmask29_32(in,out); - - case 30: - return __fastpackwithoutmask30_32(in,out); - - case 31: - return __fastpackwithoutmask31_32(in,out); - - case 32: - return __fastpackwithoutmask32_32(in,out); - - default: - break; - } - //throw logic_error("number of bits is unsupported"); - } diff --git a/ext/simdcomp/bitpacka.h b/ext/simdcomp/bitpacka.h deleted file mode 100644 index 6fa76c8..0000000 --- a/ext/simdcomp/bitpacka.h +++ /dev/null @@ -1,28 +0,0 @@ -/** - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. 
- * - * (c) Daniel Lemire, http://lemire.me/en/ - */ -#ifndef BITPACKINGALIGNED -#define BITPACKINGALIGNED -#include -#include -#include - -const uint32_t * fastunpack_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); -uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); - -const uint32_t * fastunpack_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); -uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); - -const uint32_t * fastunpack_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); -uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); - -const uint32_t * fastunpack_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); - -uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); - - - -#endif // BITPACKINGALIGNED diff --git a/ext/simdcomp/example.c b/ext/simdcomp/example.c deleted file mode 100644 index 0394e20..0000000 --- a/ext/simdcomp/example.c +++ /dev/null @@ -1,66 +0,0 @@ -#include -#include -#include "simdcomp.h" - - -// compresses data from datain to buffer, returns how many bytes written -size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) { - if(length/SIMDBlockSize*SIMDBlockSize != length) { - printf("Data length should be a multiple of %i \n",SIMDBlockSize); - } - uint32_t offset = 0; - uint8_t * initout = buffer; - for(size_t k = 0; k < length / SIMDBlockSize; ++k) { - uint32_t b = simdmaxbitsd1(offset, - datain + k * SIMDBlockSize); - *buffer++ = b; - simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer, - b); - offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; - buffer += b * sizeof(__m128i); - } - return buffer - initout; -} - - -int main() { - 
int REPEAT = 5; - int N = 1000000 * SIMDBlockSize;//SIMDBlockSize is 128 - uint32_t * datain = malloc(N * sizeof(uint32_t)); - size_t compsize; - clock_t start, end; - - uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); // output buffer - uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); - for (int gap = 1; gap <= 243; gap *= 3) { - printf("\n"); - printf(" gap = %u \n", gap); - for (int k = 0; k < N; ++k) - datain[k] = k * gap; - uint32_t offset = 0; - compsize = compress(datain,N,buffer); - printf("compression rate = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); - start = clock(); - uint32_t bogus = 0; - for(int repeat = 0; repeat < REPEAT; ++repeat) { - uint8_t * decbuffer = buffer; - for (int k = 0; k * SIMDBlockSize < N; ++k) { - uint8_t b = *decbuffer++; - simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); - // do something here with backbuffer - bogus += backbuffer[3]; - decbuffer += b * sizeof(__m128i); - offset = backbuffer[SIMDBlockSize - 1]; - } - } - end = clock(); - double numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; - printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); - printf("ignore me %i \n",bogus); - } - free(buffer); - free(datain); - free(backbuffer); - return 0; -} - diff --git a/ext/simdcomp/include/simdbitpacking.h b/ext/simdcomp/include/simdbitpacking.h deleted file mode 100644 index 301f4f5..0000000 --- a/ext/simdcomp/include/simdbitpacking.h +++ /dev/null @@ -1,21 +0,0 @@ -/** - * This code is released under a BSD License. 
- */ -#ifndef SIMDBITPACKING_H_ -#define SIMDBITPACKING_H_ - -#include // SSE2 is required -#include // use a C99-compliant compiler, please -#include // for memset - -//reads 128 values from "in", writes "bit" 128-bit vectors to "out" -void simdpack(const uint32_t * in,__m128i * out, uint32_t bit); - -//reads 128 values from "in", writes "bit" 128-bit vectors to "out" -void simdpackwithoutmask(const uint32_t * in,__m128i * out, uint32_t bit); - -//reads "bit" 128-bit vectors from "in", writes 128 values to "out" -void simdunpack(const __m128i * in,uint32_t * out, uint32_t bit); - - -#endif /* SIMDBITPACKING_H_ */ diff --git a/ext/simdcomp/include/simdcomp.h b/ext/simdcomp/include/simdcomp.h deleted file mode 100644 index 8875f0f..0000000 --- a/ext/simdcomp/include/simdcomp.h +++ /dev/null @@ -1,12 +0,0 @@ -/** - * This code is released under a BSD License. - */ - -#ifndef SIMDCOMP_H_ -#define SIMDCOMP_H_ - -#include "simdbitpacking.h" -#include "simdcomputil.h" -#include "simdintegratedbitpacking.h" - -#endif diff --git a/ext/simdcomp/include/simdcomputil.h b/ext/simdcomp/include/simdcomputil.h deleted file mode 100644 index 107665b..0000000 --- a/ext/simdcomp/include/simdcomputil.h +++ /dev/null @@ -1,29 +0,0 @@ -/** - * This code is released under a BSD License. 
- */ - -#ifndef SIMDCOMPUTIL_H_ -#define SIMDCOMPUTIL_H_ - -#include // SSE2 is required -#include // use a C99-compliant compiler, please - - - - -// returns the integer logarithm of v (bit width) -uint32_t bits(const uint32_t v); - -// max integer logarithm over a range of SIMDBlockSize integers (128 integer) -uint32_t maxbits(const uint32_t * begin); - -enum{ SIMDBlockSize = 128}; - -// like maxbit over 128 integers (SIMDBlockSize) with provided initial value -// and using differential coding -uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in); - - - - -#endif /* SIMDCOMPUTIL_H_ */ diff --git a/ext/simdcomp/include/simdintegratedbitpacking.h b/ext/simdcomp/include/simdintegratedbitpacking.h deleted file mode 100644 index 18ca795..0000000 --- a/ext/simdcomp/include/simdintegratedbitpacking.h +++ /dev/null @@ -1,27 +0,0 @@ -/** - * This code is released under a BSD License. - */ - -#ifndef SIMD_INTEGRATED_BITPACKING_H -#define SIMD_INTEGRATED_BITPACKING_H - -#include // SSE2 is required -#include // use a C99-compliant compiler, please - -#include "simdcomputil.h" - -//reads 128 values from "in", writes "bit" 128-bit vectors to "out" -// integer values should be in sorted order (for best results) -void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit); - - -//reads 128 values from "in", writes "bit" 128-bit vectors to "out" -// integer values should be in sorted order (for best results) -void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit); - - -//reads "bit" 128-bit vectors from "in", writes 128 values to "out" -void simdunpackd1(uint32_t initvalue, const __m128i * in,uint32_t * out, uint32_t bit); - - -#endif diff --git a/ext/simdcomp/makefile b/ext/simdcomp/makefile deleted file mode 100644 index 6ebd9d9..0000000 --- a/ext/simdcomp/makefile +++ /dev/null @@ -1,54 +0,0 @@ -# minimalist makefile -.SUFFIXES: -# -.SUFFIXES: .cpp .o .c .h - -CFLAGS = -fPIC -std=c99 -O3 -Wall 
-Wextra -Wno-unused-parameter -pedantic -LDFLAGS = -shared -LIBNAME=libsimdcomp.so.0.0.3 -all: unit $(LIBNAME) -test: - ./unit -install: $(OBJECTS) - cp $(LIBNAME) /usr/local/lib - ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so - ldconfig - cp $(HEADERS) /usr/local/include - - - -HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h - -uninstall: - for h in $(HEADERS) ; do rm /usr/local/$$h; done - rm /usr/local/lib/$(LIBNAME) - rm /usr/local/lib/libsimdcomp.so - ldconfig - - -OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o - -$(LIBNAME): $(OBJECTS) - $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS) - - - -simdcomputil.o: ./src/simdcomputil.c $(HEADERS) - $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude - -simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS) - $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude - -simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS) - $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude - -example: ./example.c $(HEADERS) $(OBJECTS) - $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS) - -unit: ./src/unit.c $(HEADERS) $(OBJECTS) - $(CC) $(CFLAGS) -o unit ./src/unit.c -Iinclude $(OBJECTS) -dynunit: ./src/unit.c $(HEADERS) $(LIBNAME) - $(CC) $(CFLAGS) -o dynunit ./src/unit.c -Iinclude -lsimdcomp - -clean: - rm -f unit *.o $(LIBNAME) diff --git a/ext/simdcomp/src/simdbitpacking.c b/ext/simdcomp/src/simdbitpacking.c deleted file mode 100644 index 7137682..0000000 --- a/ext/simdcomp/src/simdbitpacking.c +++ /dev/null @@ -1,14009 +0,0 @@ -/** - * This code is released under a BSD License. 
- */ -#include "../include/simdbitpacking.h" - - -static void SIMD_nullunpacker32(const __m128i * _in , uint32_t * out) { - (void) _in; - memset(out,0,32 * 4 * 4); -} - -static void __SIMD_fastpackwithoutmask1_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask2_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask3_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - 
InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask5_32(const uint32_t * _in, 
__m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask6_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask7_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); 
- InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask9_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask10_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask11_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask12_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask13_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, 
OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask14_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_loadu_si128(++in); 
- - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask15_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask17_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask18_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask19_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask20_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask21_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask22_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - 
InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask23_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask24_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = 
InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask25_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask26_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; 
- OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask27_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask28_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask29_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask30_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg 
= InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask31_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask32_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - 
OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - 
OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpackwithoutmask4_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg; - uint32_t outer; - for(outer=0; outer< 4 ;++outer) { - InReg = _mm_loadu_si128(in); - OutReg = InReg; - - InReg = _mm_loadu_si128(in+1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - - InReg = _mm_loadu_si128(in+2); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = _mm_loadu_si128(in+3); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - - InReg = _mm_loadu_si128(in+4); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = _mm_loadu_si128(in+5); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - - InReg = _mm_loadu_si128(in+6); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - - InReg = _mm_loadu_si128(in+7); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=8; - } - -} - - - -static void __SIMD_fastpackwithoutmask8_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg; - uint32_t outer; - for(outer=0; outer< 8 ;++outer) { - InReg = _mm_loadu_si128(in); - OutReg = InReg; - - InReg = _mm_loadu_si128(in+1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = _mm_loadu_si128(in+2); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = _mm_loadu_si128(in+3); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=4; - } - -} - - - -static void 
__SIMD_fastpackwithoutmask16_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - __m128i InReg; - uint32_t outer; - for(outer=0; outer< 16 ;++outer) { - InReg = _mm_loadu_si128(in); - OutReg = InReg; - - InReg = _mm_loadu_si128(in+1); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=2; - } - -} - - - -static void __SIMD_fastpack1_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<1)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - 
- OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack2_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<2)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack3_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<3)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
- - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack5_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<5)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack6_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<6)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack7_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<7)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack9_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<9)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack10_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<10)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - 
_mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack11_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<11)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack12_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<12)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack13_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<13)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack14_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<14)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack15_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<15)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack17_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<17)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack18_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<18)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg 
= _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack19_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<19)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack20_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<20)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack21_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<21)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack22_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<22)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack23_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<23)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - 
++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack24_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<24)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack25_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); 
- __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<25)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 25 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack26_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = 
_mm_set1_epi32((1U<<26)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - 
_mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack27_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<27)-1); - - __m128i InReg = 
_mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 27 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack28_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = 
(const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<28)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 
20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack29_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = 
_mm_set1_epi32((1U<<29)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; 
- OutReg = _mm_srli_epi32(InReg, 29 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack30_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<30)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 30 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = 
_mm_srli_epi32(InReg, 30 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack31_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32((1U<<31)-1); - - __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, 
OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, 
OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); 
- ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); - - -} - - - -static void __SIMD_fastpack32_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg 
= _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - ++out; - InReg = _mm_loadu_si128(++in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - -} - - - -static 
void __SIMD_fastpack4_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U<<4)-1); - - uint32_t outer; - for(outer=0; outer< 4 ;++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - - InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+6), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=8; - } - -} - - - -static void __SIMD_fastpack8_32(const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U<<8)-1); - - uint32_t outer; - for(outer=0; outer< 8 ;++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - - InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - - InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=4; - } - -} - - - -static void __SIMD_fastpack16_32(const uint32_t * _in, __m128i * out) { - 
const __m128i *in = (const __m128i*)(_in); - __m128i OutReg, InReg; - const __m128i mask = _mm_set1_epi32((1U<<16)-1); - - uint32_t outer; - for(outer=0; outer< 16 ;++outer) { - InReg = _mm_and_si128(_mm_loadu_si128(in), mask); - OutReg = InReg; - - InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - ++out; - - in+=2; - } - -} - - - - -static void __SIMD_fastunpack1_32(const __m128i* in, uint32_t * _out) { - __m128i* out = (__m128i*)(_out); - __m128i InReg1 = _mm_loadu_si128(in); - __m128i InReg2 = InReg1; - __m128i OutReg1, OutReg2, OutReg3, OutReg4; - const __m128i mask = _mm_set1_epi32(1); - - unsigned shift = 0; - unsigned i; - for (i = 0; i < 8; ++i) { - OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); - OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); - OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); - OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); - _mm_storeu_si128(out++, OutReg1); - _mm_storeu_si128(out++, OutReg2); - _mm_storeu_si128(out++, OutReg3); - _mm_storeu_si128(out++, OutReg4); - } -} - - - - -static void __SIMD_fastunpack2_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<2)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); 
- - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack3_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<3)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void 
__SIMD_fastunpack4_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<4)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack5_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<5)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,25) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - 
- OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack6_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<6)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; 
- InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack7_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<7)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack8_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<8)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack9_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<9)-1); - - OutReg = _mm_and_si128( InReg , 
mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack10_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask 
= _mm_set1_epi32((1U<<10)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); 
- _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack11_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = 
_mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<11)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); - _mm_storeu_si128(out++, 
OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , 
mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack12_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<12)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - _mm_storeu_si128(out++, 
OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack13_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<13)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack14_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<14)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack15_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<15)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); - _mm_storeu_si128(out++, 
OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack16_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<16)-1); - 
- OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - 
InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack17_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<17)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack18_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = 
_mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<18)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - _mm_storeu_si128(out++, 
OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack19_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<19)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,13) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack20_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<20)-1); - - OutReg = _mm_and_si128( InReg , 
mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - _mm_storeu_si128(out++, 
OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack21_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<21)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), 
mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,11) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack22_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i 
mask = _mm_set1_epi32((1U<<22)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - _mm_storeu_si128(out++, OutReg); - 
- OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack23_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<23)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,9) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack24_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<24)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - 
InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack25_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<25)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); - _mm_storeu_si128(out++, 
OutReg); - - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), 
mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,7) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack26_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<26)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) 
; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,6) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack27_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<27)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,7) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = 
_mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); - - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,5) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack28_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<28)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg 
, mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - _mm_storeu_si128(out++, OutReg); - - -} - - 
- - -static void __SIMD_fastunpack29_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<29)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,5) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), 
mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,7) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,3) ; - _mm_storeu_si128(out++, OutReg); - 
- -} - - - - -static void __SIMD_fastunpack30_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<30)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,2) ; - InReg = _mm_loadu_si128(++in); - - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = 
_mm_srli_epi32(InReg,2) ; - _mm_storeu_si128(out++, OutReg); - - -} - - - - -static void __SIMD_fastunpack31_32(const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - const __m128i mask = _mm_set1_epi32((1U<<31)-1); - - OutReg = _mm_and_si128( InReg , mask); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,31) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,30) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,29) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,28) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,27) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,26) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,25) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,24) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,23) ; - InReg = _mm_loadu_si128(++in); - 
- OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,22) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,21) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,20) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,19) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,18) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,17) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,16) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,15) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,14) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,13) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,12) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,11) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,10) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,9) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,8) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,7) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,6) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,5) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,4) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,3) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); - 
_mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,2) ; - InReg = _mm_loadu_si128(++in); - - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); - _mm_storeu_si128(out++, OutReg); - - OutReg = _mm_srli_epi32(InReg,1) ; - _mm_storeu_si128(out++, OutReg); - - -} - - -void __SIMD_fastunpack32_32(const __m128i* in, uint32_t * _out) { - __m128i* out = (__m128i*)(_out); - uint32_t outer; - for(outer=0; outer< 32 ;++outer) { - _mm_storeu_si128(out++, _mm_loadu_si128(in++)); - } -} - - - -void simdunpack(const __m128i * in, uint32_t * out, const uint32_t bit) { - switch(bit) { - case 0: SIMD_nullunpacker32(in,out); return; - - case 1: __SIMD_fastunpack1_32(in,out); return; - - case 2: __SIMD_fastunpack2_32(in,out); return; - - case 3: __SIMD_fastunpack3_32(in,out); return; - - case 4: __SIMD_fastunpack4_32(in,out); return; - - case 5: __SIMD_fastunpack5_32(in,out); return; - - case 6: __SIMD_fastunpack6_32(in,out); return; - - case 7: __SIMD_fastunpack7_32(in,out); return; - - case 8: __SIMD_fastunpack8_32(in,out); return; - - case 9: __SIMD_fastunpack9_32(in,out); return; - - case 10: __SIMD_fastunpack10_32(in,out); return; - - case 11: __SIMD_fastunpack11_32(in,out); return; - - case 12: __SIMD_fastunpack12_32(in,out); return; - - case 13: __SIMD_fastunpack13_32(in,out); return; - - case 14: __SIMD_fastunpack14_32(in,out); return; - - case 15: __SIMD_fastunpack15_32(in,out); return; - - case 16: __SIMD_fastunpack16_32(in,out); return; - - case 17: __SIMD_fastunpack17_32(in,out); return; - - case 18: __SIMD_fastunpack18_32(in,out); return; - - case 19: __SIMD_fastunpack19_32(in,out); return; - - case 20: __SIMD_fastunpack20_32(in,out); return; - - case 21: __SIMD_fastunpack21_32(in,out); return; - - case 22: __SIMD_fastunpack22_32(in,out); return; - - case 23: __SIMD_fastunpack23_32(in,out); return; - - case 24: __SIMD_fastunpack24_32(in,out); return; - - case 25: __SIMD_fastunpack25_32(in,out); return; - - case 26: 
__SIMD_fastunpack26_32(in,out); return; - - case 27: __SIMD_fastunpack27_32(in,out); return; - - case 28: __SIMD_fastunpack28_32(in,out); return; - - case 29: __SIMD_fastunpack29_32(in,out); return; - - case 30: __SIMD_fastunpack30_32(in,out); return; - - case 31: __SIMD_fastunpack31_32(in,out); return; - - case 32: __SIMD_fastunpack32_32(in,out); return; - - default: break; - } -} - - - - /*assumes that integers fit in the prescribed number of bits*/ -void simdpackwithoutmask(const uint32_t * in, __m128i * out, const uint32_t bit) { - switch(bit) { - case 0: return; - - case 1: __SIMD_fastpackwithoutmask1_32(in,out); return; - - case 2: __SIMD_fastpackwithoutmask2_32(in,out); return; - - case 3: __SIMD_fastpackwithoutmask3_32(in,out); return; - - case 4: __SIMD_fastpackwithoutmask4_32(in,out); return; - - case 5: __SIMD_fastpackwithoutmask5_32(in,out); return; - - case 6: __SIMD_fastpackwithoutmask6_32(in,out); return; - - case 7: __SIMD_fastpackwithoutmask7_32(in,out); return; - - case 8: __SIMD_fastpackwithoutmask8_32(in,out); return; - - case 9: __SIMD_fastpackwithoutmask9_32(in,out); return; - - case 10: __SIMD_fastpackwithoutmask10_32(in,out); return; - - case 11: __SIMD_fastpackwithoutmask11_32(in,out); return; - - case 12: __SIMD_fastpackwithoutmask12_32(in,out); return; - - case 13: __SIMD_fastpackwithoutmask13_32(in,out); return; - - case 14: __SIMD_fastpackwithoutmask14_32(in,out); return; - - case 15: __SIMD_fastpackwithoutmask15_32(in,out); return; - - case 16: __SIMD_fastpackwithoutmask16_32(in,out); return; - - case 17: __SIMD_fastpackwithoutmask17_32(in,out); return; - - case 18: __SIMD_fastpackwithoutmask18_32(in,out); return; - - case 19: __SIMD_fastpackwithoutmask19_32(in,out); return; - - case 20: __SIMD_fastpackwithoutmask20_32(in,out); return; - - case 21: __SIMD_fastpackwithoutmask21_32(in,out); return; - - case 22: __SIMD_fastpackwithoutmask22_32(in,out); return; - - case 23: __SIMD_fastpackwithoutmask23_32(in,out); return; - - case 24: 
__SIMD_fastpackwithoutmask24_32(in,out); return; - - case 25: __SIMD_fastpackwithoutmask25_32(in,out); return; - - case 26: __SIMD_fastpackwithoutmask26_32(in,out); return; - - case 27: __SIMD_fastpackwithoutmask27_32(in,out); return; - - case 28: __SIMD_fastpackwithoutmask28_32(in,out); return; - - case 29: __SIMD_fastpackwithoutmask29_32(in,out); return; - - case 30: __SIMD_fastpackwithoutmask30_32(in,out); return; - - case 31: __SIMD_fastpackwithoutmask31_32(in,out); return; - - case 32: __SIMD_fastpackwithoutmask32_32(in,out); return; - - default: break; - } -} - - - - /*assumes that integers fit in the prescribed number of bits*/ -void simdpack(const uint32_t * in, __m128i * out, const uint32_t bit) { - switch(bit) { - case 0: return; - - case 1: __SIMD_fastpack1_32(in,out); return; - - case 2: __SIMD_fastpack2_32(in,out); return; - - case 3: __SIMD_fastpack3_32(in,out); return; - - case 4: __SIMD_fastpack4_32(in,out); return; - - case 5: __SIMD_fastpack5_32(in,out); return; - - case 6: __SIMD_fastpack6_32(in,out); return; - - case 7: __SIMD_fastpack7_32(in,out); return; - - case 8: __SIMD_fastpack8_32(in,out); return; - - case 9: __SIMD_fastpack9_32(in,out); return; - - case 10: __SIMD_fastpack10_32(in,out); return; - - case 11: __SIMD_fastpack11_32(in,out); return; - - case 12: __SIMD_fastpack12_32(in,out); return; - - case 13: __SIMD_fastpack13_32(in,out); return; - - case 14: __SIMD_fastpack14_32(in,out); return; - - case 15: __SIMD_fastpack15_32(in,out); return; - - case 16: __SIMD_fastpack16_32(in,out); return; - - case 17: __SIMD_fastpack17_32(in,out); return; - - case 18: __SIMD_fastpack18_32(in,out); return; - - case 19: __SIMD_fastpack19_32(in,out); return; - - case 20: __SIMD_fastpack20_32(in,out); return; - - case 21: __SIMD_fastpack21_32(in,out); return; - - case 22: __SIMD_fastpack22_32(in,out); return; - - case 23: __SIMD_fastpack23_32(in,out); return; - - case 24: __SIMD_fastpack24_32(in,out); return; - - case 25: __SIMD_fastpack25_32(in,out); 
return; - - case 26: __SIMD_fastpack26_32(in,out); return; - - case 27: __SIMD_fastpack27_32(in,out); return; - - case 28: __SIMD_fastpack28_32(in,out); return; - - case 29: __SIMD_fastpack29_32(in,out); return; - - case 30: __SIMD_fastpack30_32(in,out); return; - - case 31: __SIMD_fastpack31_32(in,out); return; - - case 32: __SIMD_fastpack32_32(in,out); return; - - default: break; - } -} - - - diff --git a/ext/simdcomp/src/simdcomputil.c b/ext/simdcomp/src/simdcomputil.c deleted file mode 100644 index 9b36da5..0000000 --- a/ext/simdcomp/src/simdcomputil.c +++ /dev/null @@ -1,56 +0,0 @@ -#include "../include/simdcomputil.h" - -__attribute__((always_inline)) -static inline __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, - _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); -} - - -// returns the integer logarithm of v (bit width) -uint32_t bits(const uint32_t v) { -#ifdef _MSC_VER - if (v == 0) { - return 0; - } - unsigned long answer; - _BitScanReverse(&answer, v); - return answer + 1; -#else - return v == 0 ? 
0 : 32 - __builtin_clz(v); // assume GCC-like compiler if not microsoft -#endif -} - -__attribute__ ((pure)) -uint32_t maxbits(const uint32_t * begin) { - uint32_t accumulator = 0;const uint32_t * k; - for (k = begin; k != begin + SIMDBlockSize; ++k) { - accumulator |= *k; - } - return bits(accumulator); -} - -static uint32_t maxbitas32int(const __m128i accumulator) { - uint32_t tmparray[4]; - _mm_storeu_si128((__m128i *) (tmparray), accumulator); - return bits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); -} - - -// maxbit over 128 integers (SIMDBlockSize) with provided initial value -uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) { - __m128i initoffset = _mm_set1_epi32 (initvalue); - const __m128i* pin = (const __m128i*)(in); - __m128i newvec = _mm_loadu_si128(pin); - __m128i accumulator = Delta(newvec , initoffset); - __m128i oldvec = newvec; - uint32_t k; - for(k = 1; 4*k < SIMDBlockSize; ++k) { - newvec = _mm_loadu_si128(pin+k); - accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec)); - oldvec = newvec; - } - initoffset = oldvec; - return maxbitas32int(accumulator); -} - diff --git a/ext/simdcomp/src/simdintegratedbitpacking.c b/ext/simdcomp/src/simdintegratedbitpacking.c deleted file mode 100644 index 951bb85..0000000 --- a/ext/simdcomp/src/simdintegratedbitpacking.c +++ /dev/null @@ -1,24872 +0,0 @@ -/** - * This code is released under a BSD License. 
- */ -#include "../include/simdintegratedbitpacking.h" - -__attribute__((always_inline)) -static inline __m128i Delta(__m128i curr, __m128i prev) { - return _mm_sub_epi32(curr, - _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); -} - -__attribute__((always_inline)) -static inline __m128i PrefixSum(__m128i curr, __m128i prev) { - const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); - const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); - return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); -} - - -__m128i iunpack0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { - (void) _in; - __m128i *out = (__m128i*)(_out); - const __m128i zero = _mm_set1_epi32 (0); - unsigned i; - for (i = 0; i < 8; ++i) { - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - initOffset = PrefixSum(zero, initOffset); - _mm_storeu_si128(out++, initOffset); - } - - return initOffset; -} - - - - -void ipackwithoutmask0(__m128i initOffset , const uint32_t * _in , __m128i * out) { - (void) initOffset; - (void) _in; - (void) out; -} - - -void ipack0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { - (void) initOffset; - (void) _in; - (void) out; -} - - - -void ipackwithoutmask1(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack1(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask2(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack2(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(3U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn 
= _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask3(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg 
= Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack3(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(7U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 1); - ++in; 
- CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 3 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask4(__m128i initOffset, const 
uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack4(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(15U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask5(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack5(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(31U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 5 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask6(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack6(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(63U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 6 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask7(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack7(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(127U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 7 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask8(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i 
*in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - 
++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack8(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(255U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask9(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack9(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(511U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 9 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 9 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask10(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); 
- initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack10(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1023U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 10 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask11(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack11(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2047U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 11 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - -} - - - - 
-void ipackwithoutmask12(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - 
_mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack12(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(4095U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 12 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask13(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack13(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(8191U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 
- 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 13 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask14(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); 
- _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack14(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(16383U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset 
= CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 14 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask15(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack15(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(32767U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 9); - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 15 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask16(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - 
++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack16(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(65535U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask17(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 17 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack17(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(131071U); ; - - __m128i CurrIn = 
_mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 17 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask18(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, 
OutReg); - - -} - - - - -void ipack18(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(262143U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 18 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask19(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 19 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack19(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(524287U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset 
= CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 19 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void 
ipackwithoutmask20(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack20(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1048575U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 20 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask21(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 21 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 
- 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack21(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2097151U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 21 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 21 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask22(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack22(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(4194303U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 16); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 22 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask23(__m128i initOffset, const uint32_t * _in, __m128i * out) { 
- const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 
- 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack23(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(8388607U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; 
- OutReg = _mm_srli_epi32(InReg, 23 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 23 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask24(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack24(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const 
__m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(16777215U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - 
initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 24 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask25(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; 
- OutReg = _mm_srli_epi32(InReg, 25 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack25(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(33554431U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 4); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 25 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask26(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 
26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; 
- - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack26(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(67108863U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 26 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask27(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = 
CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - 
OutReg = _mm_srli_epi32(InReg, 27 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack27(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(134217727U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 27 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 27 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask28(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, 
initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); 
- ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - 
_mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack28(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(268435455U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); 
- ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 28 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask29(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 29 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void 
ipack29(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(536870911U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 
29 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 6); - ++in; - CurrIn = 
_mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 29 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask30(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); 
- ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - 
CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, 
OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - 
++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack30(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(1073741823U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 30 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 30 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask31(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); - 
_mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = Delta(CurrIn, initOffset); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack31(__m128i initOffset, const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - __m128i OutReg; - - - const __m128i mask = _mm_set1_epi32(2147483647U); ; - - __m128i CurrIn = _mm_loadu_si128(in); - __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - OutReg = InReg; - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 30); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 29); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 28); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 27); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 26); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 25); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 24); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 23); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 22); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 21); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 20); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 19); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 18); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 17); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 16); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 15); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 14); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = 
_mm_srli_epi32(InReg, 31 - 13); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 12); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 11); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 10); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 9); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 8); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 7); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 6); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 5); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 4); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 3); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 2); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); - _mm_storeu_si128(out, OutReg); - - ++out; - OutReg = _mm_srli_epi32(InReg, 31 - 1); - ++in; - CurrIn = _mm_loadu_si128(in); - InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); - initOffset = CurrIn; - - OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipackwithoutmask32(__m128i initOffset , const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - (void) initOffset; - __m128i OutReg; - - - __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - 
OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, 
OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - -} - - - - -void ipack32(__m128i initOffset , const uint32_t * _in, __m128i * out) { - const __m128i *in = (const __m128i*)(_in); - (void) initOffset; - __m128i OutReg; - - - - __m128i InReg = _mm_loadu_si128(in); - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - 
_mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; 
- ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - ++out; - ++in; - InReg = _mm_loadu_si128(in); - - OutReg = InReg; - _mm_storeu_si128(out, OutReg); - - -} - - - - - -__m128i iunpack1(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<1)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp 
= _mm_srli_epi32(InReg,31); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack2(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<2)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack3(__m128i initOffset, const 
__m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<3)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack4(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - 
__m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<4)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, 
mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack5(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<5)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - 
- tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack6(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - 
__m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<6)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return 
initOffset; - -} - - - - - -__m128i iunpack7(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<7)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); 
- OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack8(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<8)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, 
OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); 
- OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack9(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<9)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack10(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<10)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - 
OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg 
= tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack11(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<11)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack12(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<12)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); 
- OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack13(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<13)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack14(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<14)-1); - - - - tmp = InReg; - OutReg 
= _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack15(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<15)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack16(__m128i initOffset, const __m128i* in, uint32_t * 
_out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<16)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; 
- _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - 
OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack17(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<17)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
17-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack18(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<18)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg 
= tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack19(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<19)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); 
- initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack20(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<20)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - 
OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg 
= PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack21(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<21)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); - - OutReg = PrefixSum(OutReg, 
initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - 
OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = 
_mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack22(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<22)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack23(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<23)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); 
- initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; 
- ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack24(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<24)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - 
initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack25(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<25)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, 
OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); 
- - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp 
= _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack26(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<26)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - 
++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; 
InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack27(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<27)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - 
tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); - - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); - - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack28(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<28)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = 
OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack29(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<29)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); - - OutReg = 
PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack30(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<30)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset 
= OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = 
_mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = 
_mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - - -__m128i iunpack31(__m128i initOffset, const __m128i* in, uint32_t * _out) { - - __m128i* out = (__m128i*)(_out); - __m128i InReg = _mm_loadu_si128(in); - __m128i OutReg; - __m128i tmp; - __m128i mask = _mm_set1_epi32((1U<<31)-1); - - - - tmp = InReg; - OutReg = _mm_and_si128(tmp, mask); - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,31); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,30); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,29); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,28); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,27); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,26); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,25); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,24); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,23); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,22); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); - 
- OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,21); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,20); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,19); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,18); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,17); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,16); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,15); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - 
_mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,14); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,13); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,12); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,11); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,10); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,9); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,8); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,7); - OutReg = 
tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,6); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,5); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,4); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,3); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,2); - OutReg = tmp; - ++in; InReg = _mm_loadu_si128(in); - OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); - - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - tmp = _mm_srli_epi32(InReg,1); - OutReg = tmp; - OutReg = PrefixSum(OutReg, initOffset); - initOffset = OutReg; - _mm_storeu_si128(out++, OutReg); - - - return initOffset; - -} - - - - -__m128i iunpack32(__m128i initOffset, const __m128i* in, uint32_t * _out) { - (void) initOffset; - __m128i * mout = (__m128i *)(_out); - __m128i invec; - size_t k; - for(k = 0; k < 128/4; ++k) { - invec = 
_mm_loadu_si128(in++); - _mm_storeu_si128(mout++, invec); - } - return invec; -} - - - - - void simdunpackd1(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: iunpack0(initOffset,in,out); break; - - case 1: iunpack1(initOffset,in,out); break; - - case 2: iunpack2(initOffset,in,out); break; - - case 3: iunpack3(initOffset,in,out); break; - - case 4: iunpack4(initOffset,in,out); break; - - case 5: iunpack5(initOffset,in,out); break; - - case 6: iunpack6(initOffset,in,out); break; - - case 7: iunpack7(initOffset,in,out); break; - - case 8: iunpack8(initOffset,in,out); break; - - case 9: iunpack9(initOffset,in,out); break; - - case 10: iunpack10(initOffset,in,out); break; - - case 11: iunpack11(initOffset,in,out); break; - - case 12: iunpack12(initOffset,in,out); break; - - case 13: iunpack13(initOffset,in,out); break; - - case 14: iunpack14(initOffset,in,out); break; - - case 15: iunpack15(initOffset,in,out); break; - - case 16: iunpack16(initOffset,in,out); break; - - case 17: iunpack17(initOffset,in,out); break; - - case 18: iunpack18(initOffset,in,out); break; - - case 19: iunpack19(initOffset,in,out); break; - - case 20: iunpack20(initOffset,in,out); break; - - case 21: iunpack21(initOffset,in,out); break; - - case 22: iunpack22(initOffset,in,out); break; - - case 23: iunpack23(initOffset,in,out); break; - - case 24: iunpack24(initOffset,in,out); break; - - case 25: iunpack25(initOffset,in,out); break; - - case 26: iunpack26(initOffset,in,out); break; - - case 27: iunpack27(initOffset,in,out); break; - - case 28: iunpack28(initOffset,in,out); break; - - case 29: iunpack29(initOffset,in,out); break; - - case 30: iunpack30(initOffset,in,out); break; - - case 31: iunpack31(initOffset,in,out); break; - - case 32: iunpack32(initOffset,in,out); break; - - default: break; - } -} - - - - /*assumes that integers fit in the prescribed number of bits*/ - -void 
simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: break; - - case 1: ipackwithoutmask1(initOffset,in,out); break; - - case 2: ipackwithoutmask2(initOffset,in,out); break; - - case 3: ipackwithoutmask3(initOffset,in,out); break; - - case 4: ipackwithoutmask4(initOffset,in,out); break; - - case 5: ipackwithoutmask5(initOffset,in,out); break; - - case 6: ipackwithoutmask6(initOffset,in,out); break; - - case 7: ipackwithoutmask7(initOffset,in,out); break; - - case 8: ipackwithoutmask8(initOffset,in,out); break; - - case 9: ipackwithoutmask9(initOffset,in,out); break; - - case 10: ipackwithoutmask10(initOffset,in,out); break; - - case 11: ipackwithoutmask11(initOffset,in,out); break; - - case 12: ipackwithoutmask12(initOffset,in,out); break; - - case 13: ipackwithoutmask13(initOffset,in,out); break; - - case 14: ipackwithoutmask14(initOffset,in,out); break; - - case 15: ipackwithoutmask15(initOffset,in,out); break; - - case 16: ipackwithoutmask16(initOffset,in,out); break; - - case 17: ipackwithoutmask17(initOffset,in,out); break; - - case 18: ipackwithoutmask18(initOffset,in,out); break; - - case 19: ipackwithoutmask19(initOffset,in,out); break; - - case 20: ipackwithoutmask20(initOffset,in,out); break; - - case 21: ipackwithoutmask21(initOffset,in,out); break; - - case 22: ipackwithoutmask22(initOffset,in,out); break; - - case 23: ipackwithoutmask23(initOffset,in,out); break; - - case 24: ipackwithoutmask24(initOffset,in,out); break; - - case 25: ipackwithoutmask25(initOffset,in,out); break; - - case 26: ipackwithoutmask26(initOffset,in,out); break; - - case 27: ipackwithoutmask27(initOffset,in,out); break; - - case 28: ipackwithoutmask28(initOffset,in,out); break; - - case 29: ipackwithoutmask29(initOffset,in,out); break; - - case 30: ipackwithoutmask30(initOffset,in,out); break; - - case 31: ipackwithoutmask31(initOffset,in,out); break; - - 
case 32: ipackwithoutmask32(initOffset,in,out); break; - - default: break; - } -} - - - - -void simdpackd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { - __m128i initOffset = _mm_set1_epi32 (initvalue); - switch(bit) { - case 0: break;; - - case 1: ipack1(initOffset, in,out); break; - - case 2: ipack2(initOffset, in,out); break; - - case 3: ipack3(initOffset, in,out); break; - - case 4: ipack4(initOffset, in,out); break; - - case 5: ipack5(initOffset, in,out); break; - - case 6: ipack6(initOffset, in,out); break; - - case 7: ipack7(initOffset, in,out); break; - - case 8: ipack8(initOffset, in,out); break; - - case 9: ipack9(initOffset, in,out); break; - - case 10: ipack10(initOffset, in,out); break; - - case 11: ipack11(initOffset, in,out); break; - - case 12: ipack12(initOffset, in,out); break; - - case 13: ipack13(initOffset, in,out); break; - - case 14: ipack14(initOffset, in,out); break; - - case 15: ipack15(initOffset, in,out); break; - - case 16: ipack16(initOffset, in,out); break; - - case 17: ipack17(initOffset, in,out); break; - - case 18: ipack18(initOffset, in,out); break; - - case 19: ipack19(initOffset, in,out); break; - - case 20: ipack20(initOffset, in,out); break; - - case 21: ipack21(initOffset, in,out); break; - - case 22: ipack22(initOffset, in,out); break; - - case 23: ipack23(initOffset, in,out); break; - - case 24: ipack24(initOffset, in,out); break; - - case 25: ipack25(initOffset, in,out); break; - - case 26: ipack26(initOffset, in,out); break; - - case 27: ipack27(initOffset, in,out); break; - - case 28: ipack28(initOffset, in,out); break; - - case 29: ipack29(initOffset, in,out); break; - - case 30: ipack30(initOffset, in,out); break; - - case 31: ipack31(initOffset, in,out); break; - - case 32: ipack32(initOffset, in,out); break; - - default: break; - } -} - diff --git a/ext/simdcomp/src/unit.c b/ext/simdcomp/src/unit.c deleted file mode 100644 index 826f447..0000000 --- a/ext/simdcomp/src/unit.c +++ 
/dev/null @@ -1,63 +0,0 @@ -/** - * This code is released under a BSD License. - */ -#include -#include -#include "simdcomp.h" - - -int main() { - int N = 5000 * SIMDBlockSize; - __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); - uint32_t * datain = malloc(N * sizeof(uint32_t)); - uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); - for (int gap = 1; gap <= 387420489; gap *= 3) { - printf(" gap = %u \n", gap); - for (int k = 0; k < N; ++k) - datain[k] = k * gap; - uint32_t offset = 0; - for (int k = 0; k * SIMDBlockSize < N; ++k) { - ///////////////////////////// - // First part works for general arrays (sorted or unsorted) - ///////////////////////////// - // we compute the bit width - const uint32_t b = maxbits(datain + k * SIMDBlockSize); - // we read 128 integers at "datain + k * SIMDBlockSize" and - // write b 128-bit vectors at "buffer" - simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); - // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer - simdunpack(buffer, backbuffer, b);//uncompressed - for (int j = 0; j < SIMDBlockSize; ++j) { - if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { - printf("bug in simdpack\n"); - return -2; - } - } - ///////////////////////////// - // next part assumes that the data is sorted (uses differential coding) - ///////////////////////////// - // we compute the bit width - const uint32_t b1 = simdmaxbitsd1(offset, - datain + k * SIMDBlockSize); - // we read 128 integers at "datain + k * SIMDBlockSize" and - // write b1 128-bit vectors at "buffer" - simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, - b1); - // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer - simdunpackd1(offset, buffer, backbuffer, b1); - for (int j = 0; j < SIMDBlockSize; ++j) { - if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { - printf("bug in simdpack d1\n"); - return -3; - } - } - offset = datain[k * SIMDBlockSize + SIMDBlockSize - 
1]; - - } - } - free(buffer); - free(datain); - free(backbuffer); - printf("Code looks good.\n"); - return 0; -} diff --git a/ext/simdcomp_/simdfor.c b/ext/simdcomp_/simdfor.c new file mode 100644 index 0000000..ed11ad4 --- /dev/null +++ b/ext/simdcomp_/simdfor.c @@ -0,0 +1,14501 @@ +/** + * This code is released under a BSD License. + */ + +#include "simdfor.h" + + + +static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { + __m128i *out = (__m128i*)(_out); + int i; + (void) _in; + for (i = 0; i < 8; ++i) { + _mm_storeu_si128(out++, initOffset); + _mm_storeu_si128(out++, initOffset); + _mm_storeu_si128(out++, initOffset); + _mm_storeu_si128(out++, initOffset); + } + + return initOffset; +} + + + + +static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { + (void) initOffset; + (void) _in; + (void) out; +} + + +static void ipackFOR1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, 
initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR4(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = 
_mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn 
= _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg 
= _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; 
+ CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + 
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +/* ipackFOR20: packs 32 input vectors into 20 output vectors using 20 bits per 32-bit lane. Each 128-bit vector is read from _in with an unaligned load and has initOffset subtracted lane-wise; the differences are OR-ed into OutReg at successive 20-bit positions, and when a difference does not fit in the current output vector its remaining high bits are carried into the next one by the right shift. out is written with unaligned stores and need not be aligned. No mask is applied to the differences, so presumably every (value - initOffset) already fits in 20 bits; confirm against the caller. The FOR in the name presumably stands for frame-of-reference. */ static void ipackFOR20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn =
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +/* ipackFOR21: packs 32 input vectors into 21 output vectors using 21 bits per 32-bit lane. Each 128-bit vector is read from _in with an unaligned load and has initOffset subtracted lane-wise; the differences are OR-ed into OutReg at successive 21-bit positions, and when a difference does not fit in the current output vector its remaining high bits are carried into the next one by the right shift. out is written with unaligned stores and need not be aligned. No mask is applied to the differences, so presumably every (value - initOffset) already fits in 21 bits; confirm against the caller. */ static void ipackFOR21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);
+ OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +/* ipackFOR22: packs 32 input vectors into 22 output vectors using 22 bits per 32-bit lane. Each 128-bit vector is read from _in with an unaligned load and has initOffset subtracted lane-wise; the differences are OR-ed into OutReg at successive 22-bit positions, and when a difference does not fit in the current output vector its remaining high bits are carried into the next one by the right shift. The shift schedule realigns after 16 inputs (11 stores), so the body is that sequence written out twice. out is written with unaligned stores and need not be aligned. No mask is applied to the differences, so presumably every (value - initOffset) already fits in 22 bits; confirm against the caller. */ static void ipackFOR22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg,
_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +/* ipackFOR23: packs 32 input vectors into 23 output vectors using 23 bits per 32-bit lane. Each 128-bit vector is read from _in with an unaligned load and has initOffset subtracted lane-wise; the differences are OR-ed into OutReg at successive 23-bit positions, and when a difference does not fit in the current output vector its remaining high bits are carried into the next one by the right shift. out is written with unaligned stores and need not be aligned. No mask is applied to the differences, so presumably every (value - initOffset) already fits in 23 bits; confirm against the caller. */ static void ipackFOR23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg,
_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +/* ipackFOR24: packs 32 input vectors into 24 output vectors using 24 bits per 32-bit lane. Each 128-bit vector is read from _in with an unaligned load and has initOffset subtracted lane-wise; the differences are OR-ed into OutReg at successive 24-bit positions, and when a difference does not fit in the current output vector its remaining high bits are carried into the next one by the right shift. The shift schedule realigns after 4 inputs (3 stores), so the body is that sequence written out eight times. out is written with unaligned stores and need not be aligned. No mask is applied to the differences, so presumably every (value - initOffset) already fits in 24 bits; confirm against the caller. */ static void ipackFOR24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg =
_mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; 
+ CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, 
initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 
22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void 
ipackFOR27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 
8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 
30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); 
+ + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_sub_epi32(CurrIn, initOffset); + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_sub_epi32(CurrIn, initOffset); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void ipackFOR32(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + (void) initOffset; + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + 
++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + + +static __m128i iunpackFOR1(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR2(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + 
return initOffset; + +} + + + + +static __m128i iunpackFOR3(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR4(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR5(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); 
+ _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR6(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR7(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i 
mask = _mm_set1_epi32((1U<<7)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR8(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + 
++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR9(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + +/* iunpackFOR10: unpacks 32 vectors of 10-bit fields. Each 32-bit lane of the input is handled as its own bit stream, lowest bits first: ten __m128i are read from `in` (unaligned loads), every field is masked to 10 bits, initOffset is added lane-wise, and the 32 results are written to _out (unaligned stores, 128 uint32_t in total). Returns initOffset unchanged. */ +static __m128i iunpackFOR10(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp;/* only the top 2 bits of each lane are left; the other 8 bits of this field are taken from the next input vector */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp;/* this field occupies bits 22..31 exactly, so neither a mask nor bits from the next vector are needed */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + +/* iunpackFOR11: unpacks 32 vectors of 11-bit fields. Each 32-bit lane of the input is handled as its own bit stream, lowest bits first: eleven __m128i are read from `in` (unaligned loads), every field is masked to 11 bits, initOffset is added lane-wise, and the 32 results are written to _out (unaligned stores, 128 uint32_t in total). Returns initOffset unchanged. */ +static __m128i iunpackFOR11(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp;/* the top 10 bits of each lane are left; the last bit of this field is taken from the next input vector */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + 
+ OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + +/* iunpackFOR12: unpacks 32 vectors of 12-bit fields. Each 32-bit lane of the input is handled as its own bit stream, lowest bits first: twelve __m128i are read from `in` (unaligned loads), every field is masked to 12 bits, initOffset is added lane-wise, and the 32 results are written to _out (unaligned stores, 128 uint32_t in total). The same 8-field pattern repeats four times. Returns initOffset unchanged. */ +static __m128i iunpackFOR12(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp;/* the top 8 bits of each lane are left; the other 4 bits of this field are taken from the next input vector */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp;/* this field occupies bits 20..31 exactly, so neither a mask nor bits from the next vector are needed */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + +/* iunpackFOR13: unpacks 32 vectors of 13-bit fields. Each 32-bit lane of the input is handled as its own bit stream, lowest bits first: thirteen __m128i are read from `in` (unaligned loads), every field is masked to 13 bits, initOffset is added lane-wise, and the 32 results are written to _out (unaligned stores, 128 uint32_t in total). Returns initOffset unchanged. */ +static __m128i iunpackFOR13(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp;/* the top 6 bits of each lane are left; the other 7 bits of this field are taken from the next input vector */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + +/* iunpackFOR14: unpacks 32 vectors of 14-bit fields. Each 32-bit lane of the input is handled as its own bit stream, lowest bits first: fourteen __m128i are read from `in` (unaligned loads), every field is masked to 14 bits, initOffset is added lane-wise, and the 32 results are written to _out (unaligned stores, 128 uint32_t in total). The same 16-field pattern repeats twice. Returns initOffset unchanged. */ +static __m128i iunpackFOR14(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp;/* the top 4 bits of each lane are left; the other 10 bits of this field are taken from the next input vector */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp;/* this field occupies bits 18..31 exactly, so neither a mask nor bits from the next vector are needed */ + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR15(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + + + tmp = 
InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR16(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); 
+ OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR17(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + 
++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
17-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR18(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR19(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, 
mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + 
+ OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,13); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR20(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg 
= _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR21(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR22(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg 
= _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR23(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
_mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); 
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR24(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg 
= _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR25(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + OutReg = _mm_add_epi32(OutReg, 
initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR26(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), 
mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR27(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), 
mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR28(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); 
+ + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR29(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR30(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + 
+ OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +static __m128i iunpackFOR31(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = tmp; + OutReg = _mm_add_epi32(OutReg, initOffset); + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + +static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) { + __m128i * mout = (__m128i *)_out; + __m128i invec; + size_t k; + (void) initvalue; + for(k = 0; k < 128/4; ++k) { + invec = _mm_loadu_si128(in++); + _mm_storeu_si128(mout++, invec); + } + return invec; +} + + + + + +void simdpackFOR(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: ipackFOR0(initOffset,in,out); break; + + case 1: ipackFOR1(initOffset,in,out); break; + + case 2: ipackFOR2(initOffset,in,out); break; + + case 3: ipackFOR3(initOffset,in,out); break; + + case 4: ipackFOR4(initOffset,in,out); break; + + case 5: ipackFOR5(initOffset,in,out); break; + + case 6: ipackFOR6(initOffset,in,out); break; + + case 7: ipackFOR7(initOffset,in,out); break; + + case 8: ipackFOR8(initOffset,in,out); break; + + case 9: ipackFOR9(initOffset,in,out); break; + + case 10: ipackFOR10(initOffset,in,out); break; + + case 11: ipackFOR11(initOffset,in,out); break; + + case 12: ipackFOR12(initOffset,in,out); break; + + case 13: ipackFOR13(initOffset,in,out); break; + + case 14: ipackFOR14(initOffset,in,out); break; + + case 15: ipackFOR15(initOffset,in,out); break; + + case 16: 
ipackFOR16(initOffset,in,out); break; + + case 17: ipackFOR17(initOffset,in,out); break; + + case 18: ipackFOR18(initOffset,in,out); break; + + case 19: ipackFOR19(initOffset,in,out); break; + + case 20: ipackFOR20(initOffset,in,out); break; + + case 21: ipackFOR21(initOffset,in,out); break; + + case 22: ipackFOR22(initOffset,in,out); break; + + case 23: ipackFOR23(initOffset,in,out); break; + + case 24: ipackFOR24(initOffset,in,out); break; + + case 25: ipackFOR25(initOffset,in,out); break; + + case 26: ipackFOR26(initOffset,in,out); break; + + case 27: ipackFOR27(initOffset,in,out); break; + + case 28: ipackFOR28(initOffset,in,out); break; + + case 29: ipackFOR29(initOffset,in,out); break; + + case 30: ipackFOR30(initOffset,in,out); break; + + case 31: ipackFOR31(initOffset,in,out); break; + + case 32: ipackFOR32(initOffset,in,out); break; + + default: break; + } +} + + + + +void simdunpackFOR(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: iunpackFOR0(initOffset, in,out); break; + + case 1: iunpackFOR1(initOffset, in,out); break; + + case 2: iunpackFOR2(initOffset, in,out); break; + + case 3: iunpackFOR3(initOffset, in,out); break; + + case 4: iunpackFOR4(initOffset, in,out); break; + + case 5: iunpackFOR5(initOffset, in,out); break; + + case 6: iunpackFOR6(initOffset, in,out); break; + + case 7: iunpackFOR7(initOffset, in,out); break; + + case 8: iunpackFOR8(initOffset, in,out); break; + + case 9: iunpackFOR9(initOffset, in,out); break; + + case 10: iunpackFOR10(initOffset, in,out); break; + + case 11: iunpackFOR11(initOffset, in,out); break; + + case 12: iunpackFOR12(initOffset, in,out); break; + + case 13: iunpackFOR13(initOffset, in,out); break; + + case 14: iunpackFOR14(initOffset, in,out); break; + + case 15: iunpackFOR15(initOffset, in,out); break; + + case 16: iunpackFOR16(initOffset, in,out); break; + + case 17: iunpackFOR17(initOffset, in,out); 
break; + + case 18: iunpackFOR18(initOffset, in,out); break; + + case 19: iunpackFOR19(initOffset, in,out); break; + + case 20: iunpackFOR20(initOffset, in,out); break; + + case 21: iunpackFOR21(initOffset, in,out); break; + + case 22: iunpackFOR22(initOffset, in,out); break; + + case 23: iunpackFOR23(initOffset, in,out); break; + + case 24: iunpackFOR24(initOffset, in,out); break; + + case 25: iunpackFOR25(initOffset, in,out); break; + + case 26: iunpackFOR26(initOffset, in,out); break; + + case 27: iunpackFOR27(initOffset, in,out); break; + + case 28: iunpackFOR28(initOffset, in,out); break; + + case 29: iunpackFOR29(initOffset, in,out); break; + + case 30: iunpackFOR30(initOffset, in,out); break; + + case 31: iunpackFOR31(initOffset, in,out); break; + + case 32: iunpackFOR32(initOffset, in,out); break; + + default: break; + } +} + + +uint32_t simdselectFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int slot) { + const uint32_t * pin = (const uint32_t *) in; + if( bit == 0) { + return initvalue; + } else if (bit == 32) { + /* silly special case */ + return pin[slot]; + } else { + const int lane = slot % 4; /* we have 4 interleaved lanes */ + const int bitsinlane = (slot / 4) * bit; /* how many bits in lane */ + const int firstwordinlane = bitsinlane / 32; + const int secondwordinlane = (bitsinlane + bit - 1) / 32; + const uint32_t firstpart = pin[4 * firstwordinlane + lane] + >> (bitsinlane % 32); + const uint32_t mask = (1 << bit) - 1; + if (firstwordinlane == secondwordinlane) { + /* easy common case*/ + return initvalue + (firstpart & mask); + } else { + /* harder case where we need to combine two words */ + const uint32_t secondpart = pin[4 * firstwordinlane + 4 + lane]; + const int usablebitsinfirstword = 32 - (bitsinlane % 32); + return initvalue + + ((firstpart | (secondpart << usablebitsinfirstword)) + & mask); + } + } + +} + + + + +int simdsearchwithlengthFOR(uint32_t initvalue, const __m128i *in, uint32_t bit, + int length, uint32_t key, 
uint32_t *presult) { + int count = length; + int begin = 0; + uint32_t val; + while (count > 0) { + int step = count / 2; + val = simdselectFOR(initvalue, in, bit, begin + step); + if (val < key) { + begin += step + 1; + count -= step + 1; + } else count = step; + } + *presult = simdselectFOR(initvalue, in, bit, begin); + return begin; +} + +int simdpackFOR_compressedbytes(int length, const uint32_t bit) { + if(bit == 0) return 0;/* nothing to do */ + if(bit == 32) { + return length * sizeof(uint32_t); + } + return (((length + 3 )/ 4) * bit + 31 ) / 32 * sizeof(__m128i); +} + +__m128i * simdpackFOR_length(uint32_t initvalue, const uint32_t * in, int length, __m128i * out, const uint32_t bit) { + int k; + int inwordpointer; + __m128i P; + uint32_t firstpass; + __m128i offset; + if(bit == 0) return out;/* nothing to do */ + if(bit == 32) { + memcpy(out,in,length*sizeof(uint32_t)); + return (__m128i *)((uint32_t *) out + length); + } + offset = _mm_set1_epi32(initvalue); + inwordpointer = 0; + P = _mm_setzero_si128(); + for(k = 0; k < length / 4 ; ++k) { + __m128i value = _mm_sub_epi32(_mm_loadu_si128(((const __m128i * ) in + k)),offset); + P = _mm_or_si128(P,_mm_slli_epi32(value, inwordpointer)); + firstpass = sizeof(uint32_t) * 8 - inwordpointer; + if(bit -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #ifndef _WIN32 +#include + #endif + #ifdef _MSC_VER +#include "vs/getopt.h" + #else #include -#include -#include - -//#define _ZIGZAG // switch to zigzag encoding mode. 
Default = differential encoding -//#define _TRANSFORM // Transform functions : transpose,zigzag - -//#define IC_STATS -//---------------------------------------- Platform --------------------------- - #ifdef _WIN32 -#include -#define sleep(t) Sleep((t) * 1000) +#include + #endif + #if !defined(_WIN32) +#include +#include +#include +#include +#include + #else +#include +#include #define srand48(x) srand(x) #define drand48() ((double)(rand()) / RAND_MAX) #define __off64_t _off64_t - #elif defined(__APPLE__) -#define fopen64(a,b) fopen(a,b) - #endif -//---------------------------------------- Time ------------------------------- + #endif + +#include +#include "conf.h" +#include "plugins.h" +#include "vint.h" + +//--------------------------------------- Time ------------------------------------------------------------------------ typedef unsigned long long tm_t; -#define TM_TMAX (1ull<<63) - -#include -#include // sleep - #define TM_T 1000000.0 -static tm_t tmtime(void) { struct timeval tm; gettimeofday(&tm, NULL); return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec; } -static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0) {}; return ts; } -static double tmsec( tm_t tm) { return (double)tm/1000000.0; } -static double tmmsec(tm_t tm) { return (double)tm/1000.0; } - -//--------------------------------------- TurboPFor ---------------------------- -#include "vint.h" -#include "vsimple.h" -#include "bitpack.h" -#include "bitunpack.h" -#include "vp4dc.h" -#include "vp4dd.h" -#include "eliasfano.h" -#include "bitutil.h" -#include "transpose.h" -#include "ext/ext.c" // external functions for comparison. 
uncomment if not needed - - #ifdef _ZIGZAG -#define bitdelta32( in, n, pa, start, mode) bitzigzag32( in, n, pa, start) -#define bitundx32(out, n, start, mode) bitunzigzag32(out, n, start) -#define bitd32( in, n, x) bitz32( in, n, x) -#define bitd132(in, n, x) bitz32( in, n, x) - -#define vbd1enc32(in, n, out, x) vbzenc32(in, n, out, x) -#define vbdenc32( in, n, out, x) vbzenc32(in, n, out, x) -#define vbd1encv32(in, n, out, x) vbzencv32(in, n, out, x) -#define vbdencv32( in, n, out, x) vbzencv32(in, n, out, x) - -#define vbd1dec32(in, n, out, x) vbzdec32(in, n, out, x) -#define vbddec32( in, n, out, x) vbzdec32(in, n, out, x) -#define vbd1decv32(in, n, out, x) vbzdecv32(in, n, out, x) -#define vbddecv32( in, n, out, x) vbzdecv32(in, n, out, x) - -#define bitd1pack32(in, n, out, x, b) bitzpack32(in, n, out, x, b) -#define bitdpack32( in, n, out, x, b) bitzpack32(in, n, out, x, b) -#define bitd1packv32(in, n, out, x, b) bitzpackv32(in, n, out, x, b) -#define bitdpackv32( in, n, out, x, b) bitzpackv32(in, n, out, x, b) - -#define bitd1unpack32(in, n, out, x, b) bitzunpack32(in, n, out, x, b) -#define bitdunpack32( in, n, out, x, b) bitzunpack32(in, n, out, x, b) -#define bitd1unpackv32(in, n, out, x, b) bitzunpackv32(in, n, out, x, b) -#define bitdunpackv32( in, n, out, x, b) bitzunpackv32(in, n, out, x, b) +#define TM_MAX (1ull<<63) + #ifdef _WIN32 +#include +static LARGE_INTEGER tps; +static tm_t tmtime(void) { LARGE_INTEGER tm; QueryPerformanceCounter(&tm); return (tm_t)((double)tm.QuadPart*1000000.0/tps.QuadPart); } +static tm_t tminit() { QueryPerformanceFrequency(&tps); tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; } + #else +static tm_t tmtime(void) { struct timespec tm; clock_gettime(CLOCK_MONOTONIC, &tm); return (tm_t)tm.tv_sec*1000000ull + tm.tv_nsec/1000; } +static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; } #endif +//---------------------------------------- bench 
---------------------------------------------------------------------- +#define TM_MAX (1ull<<63) + +#define MIS 4000000.0 +#define TMIS(__l,__t) ((__t)>=0.000001?((double)(__l)/MIS)/((__t)/TM_T):0.0) + +#define MBS 1000000.0 +#define TMBS(__l,__t) ((__t)>=0.000001?((double)(__l)/MBS)/((__t)/TM_T):0.0) +#define TMDEF unsigned tm_r,tm_R,tm_c; tm_t _t0,_tc,_ts; double _tmbs=0.0; +#define TMSLEEP do { tm_T = tmtime(); if(!tm_0) tm_0 = tm_T; else if(tm_T - tm_0 > tm_TX) { printf("S \b\b\b");fflush(stdout); sleep(tm_slp); tm_0=tmtime();} } while(0) +#define TMBEG(_c_, _tm_reps_, _tm_Reps_) \ + for(tm_c=_c_,tm_tm = TM_MAX,tm_rm=1,tm_R=0,_ts=tmtime(); tm_R < _tm_Reps_; tm_R++) { printf("%8.2f %.2d_%d\b\b\b\b\b\b\b\b\b\b\b\b\b",_tmbs,tm_R+1,tm_c);fflush(stdout);\ + for(_t0 = tminit(), tm_r=0; tm_r < _tm_reps_;) { -unsigned char *u32enc(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *ip; - #if 0 - memcpy(out,in,n*4); return (unsigned char *)(out+n); - #else - for(ip = in; ip != in+(n&~3); ) { - *out++ = *ip++; - *out++ = *ip++; - *out++ = *ip++; - *out++ = *ip++; +#define TMEND(_len_) tm_T = tmtime(); tm_r++; if((_tc = (tm_T - _t0)) > tm_tx) break; }\ + if(_tc/(double)tm_r < (double)tm_tm/(double)tm_rm) { tm_tm = _tc,tm_rm=tm_r; tm_c++; double _d = (double)tm_tm/(double)tm_rm; _tmbs=TMIS(_len_, _d); } else if(_tc/tm_tm>1.2) TMSLEEP; if(tm_T-_ts > tm_TX) break;\ + if((tm_R & 7)==7) { sleep(tm_slp); _ts=tmtime(); } } + +static unsigned tm_repc = 1<<30, tm_Repc = 3, tm_repd = 1<<30, tm_Repd = 3, tm_rm, tm_slp = 25; +static tm_t tm_tm, tm_tx = TM_T, tm_TX = 30*TM_T, tm_0, tm_T, tm_RepkT=24*3600*TM_T; + +//: b 512, kB 1000, K 1024, MB 1000*1000, M 1024*1024, GB 1000*1000*1000, G 1024*1024*1024 + +#define Kb (1u<<10) +#define Mb (1u<<20) +#define Gb (1u<<30) +#define KB 1000 +#define MB 1000000 +#define GB 1000000000 + +unsigned argtoi(char *s, unsigned def) { + char *p; + unsigned n = strtol(s, &p, 10),f = 1; + switch(*p) { + case 'K': f = KB; break; + case 
'M': f = MB; break; + case 'G': f = GB; break; + case 'k': f = Kb; break; + case 'm': f = Mb; break; + case 'g': f = Gb; break; + case 'b': return 1u << n; + default: f = def; } - while(ip < in+n) *out++ = *ip++; - return (unsigned char *)out; - #endif + return n*f; } -unsigned char *u32dec(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *op; - #if 0 - memcpy(out,in,n*4); return (unsigned char *)(in+n); - #else - for(op = out; op != out+(n&~3); ) { - *op++ = *in++; - *op++ = *in++; - *op++ = *in++; - *op++ = *in++; +unsigned long long argtol(char *s) { + char *p; + unsigned long long n = strtol(s, &p, 10),f=1; + switch(*p) { + case 'K': f = KB; break; + case 'M': f = MB; break; + case 'G': f = GB; break; + case 'k': f = Kb; break; + case 'm': f = Mb; break; + case 'g': f = Gb; break; + case 'b': return 1u << n; + default: f = MB; } - while(op < out+n) *op++ = *in++; - return (unsigned char *)in; - #endif + return n*f; } -#define PAD8(__x) (((__x)+7)/8) -unsigned char *_bitunpackx32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out , unsigned b) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); } +unsigned long long argtot(char *s) { + char *p; + unsigned long long n = strtol(s, &p, 10),f=1; + switch(*p) { + case 'h': f = 3600000; break; + case 'm': f = 60000; break; + case 's': f = 1000; break; + case 'M': f = 1; break; + default: f = 1000; + } + return n*f; +} -// direct access functions included for demonstration only. 
Use the bulk functions instead, if you are decompressing all values -unsigned char *bitf1unpackx32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, int start, unsigned b) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+i+1; return in + PAD8(n*b); } -unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, int start, unsigned b) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start; return in + PAD8(n*b); } +int strpref(char **str, int n, char sep1, char sep2) { + int i, j=0; + for(;;j++) + for(i = 0; i < n; i++) + if(!str[i][j] || str[i][j] != str[0][j]) { + while (j > 0 && str[0][j-1] != sep1 && str[0][j-1] != sep2) j--; + return j; + } + return 0; +} -//------------------------------------------------------------------------------------- -#define BLK_SIZE (64*1024) -unsigned char sbuf[BLK_SIZE*5+64]; - -#define PACK_SIZE 128 -// TurboPFor External functions -enum { P_CPY, // copy - P_VB, P_VBL, P_VG8, P_VBP, P_MVB, // variable byte - P_EFANO, // elias fano - P_PCK, P_PCKR, P_PCKV, P_SIMDV, P_FOR, // bit packing - P_SV, P_SVANS, P_S16, P_S64, // simple family: , simpleV, simple16, simple-8b - P_P4D, P_P4DR, P_OPTP4, // PFor, PForDelta - P_LIBFOR, // For - P_VSQMX, // QMX - P_LZT10, P_LZT20, P_LZT22, // LzTurbo - P_LZ4, // lz4 - P_BSHUF, P_BLZ, P_BLZ4, P_BZLIB, // https://github.com/Blosc/c-blosc - P_ZLIB1, P_ZLIB2, P_ZLIB3, P_ZLIB4, P_ZLIB5, P_ZLIB6, P_ZLIB7, P_ZLIB8, P_ZLIB9, - P_TRSP, P_TRSPV, P_BTSHUF, // transform - P_ZZAG, P_DELTA, - P_MAX +void memrcpy(unsigned char *out, unsigned char *in, unsigned n) { int i; for(i = 0; i < n; i++) out[i] = ~in[i]; } + +int memcheck(unsigned char *in, unsigned n, unsigned char *cpy, int cmp) { + int i; + if(cmp <= 1) + return 0; + for(i = 0; i < n; i++) + if(in[i] != cpy[i]) { + if(cmp > 3) abort(); // crash (AFL) fuzzing + printf("ERROR at %d:%x, %x\n", i, in[i], cpy[i]); + if(cmp > 2) exit(EXIT_FAILURE); + return i+1; + } + return 
0; +} +//------------------------------- malloc ------------------------------------------------ +#define USE_MMAP + #if __WORDSIZE == 64 +#define MAP_BITS 30 + #else +#define MAP_BITS 28 + #endif + +void *_valloc(size_t size, int a) { + if(!size) return NULL; + #if defined(_WIN32) + return VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); + #elif defined(USE_MMAP) + void *ptr = mmap((size_t)a< +static ALIGNED(char, mem_heap[1<<20],32); +static char *mem_heapp = mem_heap; +static size_t mem_peak, mem_used; + +static void *(*mem_malloc)(size_t); +static void *(*mem_calloc)(size_t, size_t); +static void *(*mem_realloc)(void*, size_t); +static void (*mem_free)(void *); +static void *(*mem_memalign)(size_t, size_t); + +static __attribute__((constructor)) void mem_init(void) { + mem_malloc = dlsym(RTLD_NEXT, "malloc" ); + mem_realloc = dlsym(RTLD_NEXT, "realloc"); + mem_free = dlsym(RTLD_NEXT, "free" ); + mem_calloc = dlsym(RTLD_NEXT, "calloc" ); + mem_memalign = dlsym(RTLD_NEXT, "memalign"); + if(!mem_malloc || !mem_calloc || !mem_realloc || !mem_free || !mem_memalign) + die("malloc not found\n"); +} + +size_t mempeak() { return mem_peak; } + +size_t mempeakinit() { mem_peak = mem_used = 0; return mem_peak; } + +void mem_add(size_t size) { + if((mem_used += size) > mem_peak) + { mem_peak = mem_used; } +} + +void mem_sub(size_t size) { + if(mem_used > size) + mem_used -= size; +} + +void *malloc(size_t size) { + if(!mem_malloc) { + void *p = mem_heapp; + if((mem_heapp += size) >= mem_heap+sizeof(mem_heap)) + die("malloc:initial memory overflow\n"); + return p; + } + void *p = (*mem_malloc)(size); + if(p) + mem_add(malloc_usable_size(p)); + return p; +} + +void *calloc(size_t nmemb, size_t size) { + size_t _size = nmemb*size; + if(!mem_calloc) { + void *p = mem_heapp; + if((mem_heapp += _size) >= mem_heap+sizeof(mem_heap)) + die("calloc:initial memory overflow\n"); + memset(p,0,_size); + return p; + } + void *p = (*mem_calloc)(nmemb, size); + if(p) + 
mem_add(malloc_usable_size(p)); + return p; +} + +void *memalign(size_t nmemb, size_t size) { + size_t _size = nmemb*size; + + mem_add(_size); + void *p = (*mem_memalign)(nmemb, size); + if(p) + mem_add(malloc_usable_size(p)); + return p; +} + +void *realloc(void *p, size_t size) { + mem_sub(malloc_usable_size(p)); + if(p = (*mem_realloc)(p, size)) + mem_add(malloc_usable_size(p)); + return p; +} + +void free(void *p) { + if(!p || p >= (void*)mem_heap && p < (void*)mem_heapp) + return; + mem_sub(malloc_usable_size(p)); + (*mem_free)(p); +} + #else +#define mempeak() +#define mempeakinit() 0 +void mem_add(size_t size) {} +void mem_sub(size_t size) {} + #endif + +//--------------------------------------- TurboBench ------------------------------------------------------------------ +enum { + FMT_TEXT=1, + FMT_HTML, + FMT_HTMLT, + FMT_MARKDOWN, + FMT_VBULLETIN, // ex. post to encode.ru + FMT_CSV, + FMT_TSV, + FMT_SQUASH }; -//------------------ random integer array (not sorted) --------------------------------- -unsigned char *beenc(unsigned *__restrict in, size_t n, unsigned char *__restrict out, int id, int b) { int i,xb; - switch(id&0x3f) { - //---------- copy ---------------------------------------------------- - case P_CPY: return u32enc( in, n, (unsigned *)out); - // --------- variable byte ------------------------------------------- - case P_VB: return vbenc32( in, n, out); - - case P_VBL: return vbyteenc( in, n, (unsigned *)out); - case P_VBP: return vbpolyenc(in, n, out); - #ifdef _MASKEDVBYTE - case P_MVB: return out + vbyte_encode(in, n, out); - #endif - #ifdef _VARINTG8IU - case P_VG8: return vintg8enc(in, n, out); - #endif - // --------- simple family: , simpleV, simple16, simple-8b ----------- - case P_SV: return vsenc32( in, n, out); - case P_S16: return vs16enc( in, n, (unsigned *)out); - case P_S64: return vs8benc( in, n, out); - case P_VSQMX: { unsigned char *q = qmx_enc(in, n, out+4); *(unsigned *)out = q - (out+4); return q; } - // --------- elias 
fano ---------------------------------------------- - case P_EFANO: return out; - // --------- PFor ---------------------------------------------------- - case P_P4DR: return p4denc32( in, n, out); - case P_P4D: return n == 128?p4dencv32(in, n, out):p4denc32(in, n, out); - - case P_OPTP4: if(n < 128) return vbyteenc(in, n, (unsigned *)out); - else { unsigned tmp[2048]; for(i = 0; i < n; i++) tmp[i] = in[i]; return out + OPT4(tmp, n, (unsigned *)out); } - // --------- bit packing --------------------------------------------- - case P_FOR : - case P_PCKR: - case P_PCK: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return bitpack32(in, n, out, b); - case P_PCKV: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return n != 128?bitpack32(in, n, out, b):bitpackv32(in, n, out, b); - #ifdef _LIBFOR - case P_LIBFOR: return out + for_compress_unsorted(in, out, n); - #endif - - case P_SIMDV: if(n < 128) return vbyteenc(in, n, (unsigned *)out); else { if(b < 0) b = maxbits(in), *out++ = b; return simdpackwn(in, n, b, (unsigned *)out); } +char *fmtext[] = { "txt", "txt", "html", "htm", "md", "vbul", "csv", "tsv", "squash" }; - // --------- transform ---------------------------------------- - #ifdef _TRANSFORM - case P_ZZAG: bitzigzag32(in, n, (unsigned *)out, 0); return out + n*4; - case P_TRSP: transpose4( (unsigned char *)in, n*4, out); return out + n*4; - case P_TRSPV: transpose4( (unsigned char *)in, n*4, out); return out + n*4; - #ifdef _BLOSC - case P_BSHUF: shuffle( 4, n*4, (unsigned char *)in, out); return out + n*4; - #endif - #ifdef _BTSHUF - case P_BTSHUF: bshuf_bitshuffle(in, out, n*4/32, 32, 0); return out + n*4; //bitshuffle(4, const size_t blocksize, in, out, NULL); - #endif - #endif +//------------- plugin : usage --------------------------------- +struct plugg { + char id[17],*desc,*s; +}; - // --------- transpose + lz77 ---------------------------------------- - #ifdef _LZT - case P_LZT10: { n *= 4; transpose4( (unsigned char *)in, n, sbuf); struct lzobj lz; 
lz.srclen = n; lz.src = sbuf; lz.dst = out; lz.dstlen = n; lz.level = 0; lz.hbits = 16; return out + lz8c01(&lz); } - case P_LZT20: { n *= 4; transpose4( (unsigned char *)in, n, sbuf); struct lzobj lz; lz.srclen = n; lz.src = sbuf; lz.dst = out; lz.dstlen = n; lz.level = 0; lz.hbits = 16; return out + lzbc01(&lz); } - case P_LZT22: { n *= 4; transpose4( (unsigned char *)in, n, sbuf); struct lzobj lz; lz.srclen = n; lz.src = sbuf; lz.dst = out; lz.dstlen = n; lz.level = 2; lz.hbits = 26; return out + lzbc2(&lz); } - #endif - #ifdef _LZ4 - case P_LZ4: //bshuf_bitshuffle(in, sbuf, n*4/32, 32, 0);// - transpose4( (unsigned char *)in, n*4, sbuf); - return out + LZ4_compress((char *)sbuf, (char *)out, n*4); - #endif - #ifdef _BLOSC - case P_BLZ: - case P_BLZ4: - case P_BZLIB: return out + blosc_compress(1/*clevel*/, 1/*doshuffle*/, 4/*typesize*/, n*4, in, out, n*4+BLOSC_MAX_OVERHEAD); - #endif - #ifdef _ZLIB - case P_ZLIB1: case P_ZLIB2: case P_ZLIB3: case P_ZLIB4: case P_ZLIB5: case P_ZLIB6: case P_ZLIB7: case P_ZLIB8: case P_ZLIB9: - { n *= 4; transpose4( (unsigned char *)in, n, sbuf); uLongf outlen = n; int rc = compress2(out+4, &outlen, sbuf, n, id-P_ZLIB1+1); if(rc != Z_OK) die("zlib compress2 rc=%d\n", rc); *(unsigned *)out = outlen; return out + 4 + outlen; } - #endif - case P_MAX ... 
63: die("Fatal- Not entry %d", id); - } - return out; +struct plugg plugg[] = +{ + { "DEFAULT", "Default", "TurboPFor/TurboPFor256/TurboPackV256/TurboPackV/TurboVByte/TurboPack" }, + { "BENCH", "Benchmark", "TurboPFor/TurboPackV/TurboVByte/TurboPack/QMX/FP.SimdFastPfor/FP.SimdOptPFor/MaskedVbyte/StreamVbyte" }, + { "EFFICIENT","Efficient", "TurboPFor/vsimple/turbovbyte" }, + + { "BITPACK", "Bit Packing", "TurboPackV/TurboPack256V/TurboPackH/TurboPack/SC.SimdPack128/SC.SimdPack256" }, + { "VBYTE", "Variable byte", "TurboVByte/FP.VByte/PC.Vbyte/VarintG8IU/MaskedVbyte/StreamVbyte" }, + { "SIMPLE", "Simple Family", "simple8b/simple16/vsimple/qmx" }, + { "LZ4", "lz4+bitshufle/transpose 4/8", "lz4_bitshuffle/lz4_tp4/lz4_tp8" }, + + { "LI", "Little Integer", "LI.Pack/LI.TurboPack/LI.SuperPack/LI.HorPack/LI.BMIPack256" } +}; +#define PLUGGSIZE (sizeof(plugg)/sizeof(plugg[0])) + +void plugsprt(void) { + struct plugs *gs; + + struct plugg *pg; + printf("Codec group:\n"); + for(pg = plugg; pg < plugg+PLUGGSIZE; pg++) + printf("%-16s %s %s\n", pg->id, pg->desc); } -unsigned char *bedec(unsigned char *__restrict in, size_t n, unsigned *__restrict out, int id, int b) { - switch(id&0x3f) { - //--------- copy --------------------------------------------------- - case P_CPY: return u32dec( (unsigned *)in, n, out); - // --------- variable byte ------------------------------------------- - case P_VB: return vbdec32( in, n, out); +void plugsprtv(FILE *f, int fmt) { + struct plugs *gs; + char *pv = ""; - case P_VBL: return vbytedec( in, n, out); - case P_VBP: return vbpolydec(in, n, out); - #ifdef _MASKEDVBYTE - case P_MVB: return in + masked_vbyte_decode(in, out, n); - #endif - #ifdef _VARINTG8IU - case P_VG8: return vintg8dec(in, n, out); - #endif - // --------- simple family: simple16, simpleV, simple-8b --------------- - case P_SV: return vsdec32( in, n, out); - - case P_S16: return vs16dec( (unsigned *)in, n, out); - case P_S64: return vs8bdec( in, n, out); - case P_VSQMX: { 
unsigned l = *(unsigned *)in; return qmx_dec(in+4, l, out, n); } - // --------- elias fano ----------------------------------------------- - case P_EFANO: return in; - // --------- PFor ----------------------------------------------------- - case P_P4D : return n == 128?p4ddecv32(in, n, out):p4ddec32(in, n, out); - case P_P4DR : return p4ddecx32( in, n, out); - case P_OPTP4 : if(n < 128) return vbytedec(in, n, out); else { unsigned all_array[2048]; return (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); } - // --------- bit packing ------------------------------------------- - case P_FOR: if(b < 0) b = *in++; return bitfunpack32( in, n, out, 0, b); - case P_PCKR: if(b < 0) b = *in++; return _bitunpackx32( in, n, out, b); - case P_PCK: if(b < 0) b = *in++; return bitunpack32( in, n, out, b); - case P_PCKV: if(b < 0) b = *in++; return n != 128?bitunpack32(in, n, out, b):bitunpackv32(in, n, out, b); - - case P_SIMDV: if(n < 128) return vbytedec(in, n, out); else { if(b < 0) b = *in++; return simdunpackn( (unsigned *)in, n, b, out); } - #ifdef _LIBFOR - case P_LIBFOR: return in + for_uncompress(in, out, n); - #endif - - //---------- transpose + lz77 ---------------------- - #ifdef _TRANSFORM - case P_ZZAG: memcpy(out, in, n*4); bitunzigzag32(out, n, 0); return in + n*4; - case P_TRSP: untranspose4( (unsigned char *)in, n*4, (unsigned char *)out); return in + n*4; - case P_TRSPV: untranspose4( (unsigned char *)in, n*4, (unsigned char *)out); return in + n*4; - #ifdef _BLOSC - case P_BSHUF: unshuffle( 4, n*4, (unsigned char *)in, (unsigned char *)out); return in + n*4; - #endif - #ifdef _BTSHUF - case P_BTSHUF: bshuf_bitunshuffle(in, out, n*4/32, 32, 0); return in + n*4; - #endif - #endif - - //---------- transpose + lz77 ---------------------- - #ifdef _LZT - case P_LZT10: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lz8d(&lz); untranspose4(sbuf, n*4, (unsigned char *)out); } break; - case P_LZT20: - case 
P_LZT22: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lzbd(&lz); untranspose4(sbuf, n*4, (unsigned char *)out); } break; - #endif - #ifdef _LZ4 - case P_LZ4: in += LZ4_decompress_fast((char *)in, (char *)sbuf, n*4); //bshuf_bitunshuffle(sbuf, out, n*4/32, 32, 0); - untranspose4(sbuf, n*4, (unsigned char *)out); - break; - #endif - #ifdef _BLOSC - case P_BLZ: - case P_BLZ4: - case P_BZLIB: { blosc_decompress(in, out, n*4); size_t nbytes, cbytes,blocksize; blosc_cbuffer_sizes(in, &nbytes, &cbytes, &blocksize); return in+cbytes; } - #endif - #ifdef _ZLIB - case P_ZLIB1: case P_ZLIB2: case P_ZLIB3: case P_ZLIB4: case P_ZLIB5: case P_ZLIB6: case P_ZLIB7: case P_ZLIB8: case P_ZLIB9: - { uLongf outsize = n*4; int l = *(unsigned *)in, rc = uncompress(sbuf, &outsize, in+4, l); in += 4 + l; untranspose4(sbuf, n*4, (unsigned char *)out); } break; - #endif - case P_MAX ... 63: die("Fatal- Not entry %d", id); + switch(fmt) { + case FMT_HTMLT: + case FMT_HTML: + printf("%s\n", "TurboBench

"); + break; } - return out; -} - -unsigned char *besdec(unsigned char *__restrict in, size_t n, unsigned *__restrict out, int id, int mode) { unsigned b,x,v; - switch(id&0x3f) { - //------------- copy ------------------------------------------------------- - case P_CPY: in = u32dec( (unsigned *)in, n, out); break; - //------------- variable byte ---------------------------------------------- - case P_VB: return mode?vbd1dec32( in, n, out, -1):vbddec32(in, n, out, 0); - #ifndef _ZIGZAG - #ifdef _MASKEDVBYTE - case P_MVB: in += masked_vbyte_decode_delta(in, out, n, 0); break; - #endif - #endif +} - case P_VBL: in = vbytedec( in, n, out); bitundx32(out, n, -mode, mode); break; - case P_VBP: in = vbpolydec( in, n, out); bitundx32(out, n, -mode, mode); break; - #ifdef _VARINTG8IU - case P_VG8: in = vintg8dec( in, n, out); bitundx32(out, n, -mode, mode); break; - #endif - //------------- simple family ---------------------------------------------- - #ifdef _ZIGZAG - case P_SV: _vbget32(in, x, *out = x); in = vsdec32( in, n-1, out+1); bitundx32(out+1, n-1, x, mode); break; - #else - case P_SV: _vbget32(in, x, *out = x); in = vsdec32( in, n-1, out+1); bitundx32(out, n, -mode, mode); break; - #endif - case P_S16: _vbget32(in, x, *out = x); in = vs16dec((unsigned *)in, n-1, out+1); bitundx32(out, n, -mode, mode); break; - case P_S64: _vbget32(in, x, *out = x); in = vs8bdec( in, n-1, out+1); bitundx32(out, n, -mode, mode); break; - // ------------ elias fano ------------------------------------------------- - #ifndef _ZIGZAG - case P_EFANO: _vbget32(in, x,;); *out++ = x; --n; - if(mode) { return n==128?efano1decv32(in, n, out, x+1):efano1dec32( in, n, out, x+1); } - else { return n==128?efanodecv32( in, n, out, x ):efanodec32( in, n, out, x); } - #endif - // ------------ PFor ------------------------------------------------------- +//------------------ plugin: process ---------------------------------- +struct plug { + int id,err,blksize,lev; + char *s,prm[17],tms[20]; + 
long long len,memc,memd; + double tc,td,tck,tdk; +}; - #ifdef _ZIGZAG - case P_P4D: _vbget32(in, x, *out = x); in = n==129?p4ddecv32(in, n-1, out+1):p4ddec32(in, n-1, out+1); bitundx32(out+1, n-1, x, mode); break; - #else - case P_P4D: _vbget32(in, x, ;); *out++ = x; --n; - if(mode) { return n==128?p4dd1decv32(in, n, out, x):p4dd1dec32(in, n, out, x); } - else { return n==128?p4dddecv32( in, n, out, x):p4dddec32( in, n, out, x); } - case P_P4DR: _vbget32(in, x, *out = x); return mode?p4dfdecx32( in, n-1, out+1, x):p4df0decx32(in, n-1, out+1, x); - case P_OPTP4: - if(n < 129) in = vbytedec(in, n, out); - else { _vbget32(in, x, *out = x); unsigned all_array[2048]; in = (unsigned char *)detailed_p4_decode(out+1, (unsigned *)in, all_array); } - bitundx32(out, n, -mode, mode); break; - // ------------ bit packing ---------------------------------------- - case P_FOR: _vbget32(in, x, *out = x); b = *in++; return mode?bitf1unpack32( in, n-1, out+1, x, b):bitfunpack32( in, n-1, out+1, x, b); - case P_PCKR: _vbget32(in, x, *out = x); b = *in++; return mode?bitf1unpackx32(in, n-1, out+1, x, b):bitfunpackx32( in, n-1, out+1, x, b); - #endif - - case P_PCK: _vbget32(in, x, *out = x); b = *in++; return mode?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); - case P_PCKV: _vbget32(in, x, *out = x); b = *in++; - if(n < 129) { return mode?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); } - else { return mode?bitd1unpackv32(in, n-1, out+1, x, b):bitdunpackv32( in, n-1, out+1, x, b); } - - #ifndef _ZIGZAG - case P_SIMDV: - if(n < 129) { in = vbytedec(in, n, out); bitundx32(out, n, -mode, mode); } - else { _vbget32(in, x, *out = x); b = *in++; in = simdunpackn1((uint32_t *)in, n-1, b, out[0], out+1); } break; - #ifdef _LIBFOR - case P_LIBFOR: return in + for_uncompress(in, out, n); - #endif - #endif +struct plug plug[255],plugt[255]; +int seg_ans = 32*1024, seg_huf = 32*1024, seg_anx = 12*1024, seg_hufx=11*1024; +static int cmp = 
2,trans, verbose=1; +double fac = 1.3; - //---------- transpose + lz77 ---------------------- - #ifdef _TRANSFORM - case P_ZZAG: memcpy(out, in, n*4); bitunzigzag32(out, n, 0); return in + n*4; - case P_TRSP: untranspose4( (unsigned char *)in, n*4, (unsigned char *)out); bitundx32(out, n, -mode, mode); return in + n*4; - case P_TRSPV: untranspose4((unsigned char *)in, n*4, (unsigned char *)out); bitundx32(out, n, -mode, mode); return in + n*4; - case P_DELTA: memcpy(out, in, n*4); bitundx32(out, n, -mode, mode); return in + n*4; - #endif +int plugins(struct plug *plug, struct plugs *gs, int *pk, unsigned bsize, int bsizex, int lev, char *prm) { + int i,k = *pk; + for(i = 0; i < k; i++) + if(plug[i].id == gs->id && plug[i].lev == lev && !strcmp(plug[i].prm,prm)) + return -1; - //---------- delta + transpose + lz77 ---------------------- - #ifdef _LZ4 - case P_LZ4: in += LZ4_decompress_fast((char *)in, (char *)sbuf, n*4); untranspose4(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -mode, mode); break; - #endif + memset(&plug[k], 0, sizeof(struct plug)); + plug[k].id = gs->id; + plug[k].err = 0; + plug[k].s = gs->s; + plug[k].lev = lev; + strncpy(plug[k].prm, prm?prm:(char *)"", 16); + plug[k].prm[16] = 0; + plug[k].tms[0] = 0; + plug[k].blksize = gs->blksize?gs->blksize:bsize; + *pk = ++k; + return 0; +} - #ifdef _ZLIB - case P_ZLIB1: case P_ZLIB2: case P_ZLIB3: case P_ZLIB4: case P_ZLIB5: case P_ZLIB6: case P_ZLIB7: case P_ZLIB8: case P_ZLIB9: - { uLongf outsize = n*4; int l = *(unsigned *)in, rc = uncompress(sbuf, &outsize, in+4, l); in += 4 + l; untranspose4(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -mode, mode); } break; - #endif +int plugreg(struct plug *plug, char *cmd, int k, int bsize, int bsizex) { + static char *cempty=""; + int ignore = 0; - #ifdef _LZT - case P_LZT10: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lz8d(&lz); untranspose4(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -mode, mode); } 
break; - case P_LZT20: - case P_LZT22: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lzbd(&lz); untranspose4(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -mode, mode); } break; - #endif + while(*cmd) { + while(isspace(*cmd)) + cmd++; + char *name = cmd; + while(isalnum(*cmd) || *cmd == '_' || *cmd == '-' || *cmd == '.') + cmd++; + if(*cmd) *cmd++ = 0; - case P_MAX ... 63: break; + if(!strcmp(name, "ON" )) { + ignore = 1; + continue; + } + else if(!strcmp(name, "OFF")) { + ignore = 0; + continue; + } + + for(;;) { + while(isspace(*cmd) || *cmd == ',') + cmd++; + + char *prm = cmd; + int lev = strtol(cmd, &cmd, 10); + if(prm == cmd) { + lev = -1; + prm = cempty; + } + else if(isalnum(*cmd)) { + prm = cmd; + while(isalnum(*cmd) || *cmd == '_' || *cmd == '-' || *cmd == '.') + cmd++; + if(*cmd) + *cmd++ = 0; + } else + prm = cempty; + + int found = 0; + struct plugs *gs,*gfs=NULL; + if(!*name) + break; + for(gs = plugs; gs->id >= 0; gs++) + if(gs->codec && !strcasecmp(gs->s, name) ) { + char s[33],*q; + sprintf(s,"%d", lev); + found++; + if(lev<0 && gs->lev && !gs->lev[0] || gs->lev && (q=strstr(gs->lev, s)) && (q==gs->lev || *(q-1) == ',')) { + found++; + plugins(plug, gs, &k, bsize, bsizex, lev, prm); + } + break; + } + if(found<2 && !ignore) { + if(!found) + fprintf(stderr, "codec '%s' not found\n", name); + else if(lev<0) + fprintf(stderr, "level [%s] not specified for codec '%s'\n", gs->lev, name ); + else if(gs->lev && gs->lev[0]) + fprintf(stderr, "level '%d' for codec '%s' not in range [%s]\n", lev, name, gs->lev); + else + fprintf(stderr, "codec '%s' has no levels\n", name); + exit(0); + } + while(isspace(*cmd)) + cmd++; + if(*cmd != ',' && (*cmd < '0' || *cmd > '9')) + break; + } } - return in; + a:plug[k].id = -1; + return k; +} + +//------------------ plugin: print/plot ----------------------------- +struct bandw { + unsigned long long bw; + unsigned rtt; + char *s; +}; + +static struct bandw bw[] = { + { 7*KB, 
500, "GPRS 56" },//56kbps + { 57*KB, 150, "2G 456" }, + { 125*KB, 40, "3G 1M" }, + { 250*KB, 5, "DSL 2M" },//DSL 2000 + { 500*KB, 20, "4G 4M" }, + { 3750*KB, 5, "WIFI 30M" }, + {12500*KB, 5, "CAB 100M" }, + { 40*MB, 0, "USB2 40MB"}, + { 125*MB, 0, "ETH 1000" }, + { 200*MB, 0, "HDD 200MB"}, + { 550*MB, 0, "SSD 550MB"}, + { 1u*GB, 0, "SSD 1GB" }, + { 2u*GB, 0, "SSD 2GB" }, + { 4ull*GB, 0, "4GB/s" }, + { 8ull*GB, 0, "8GB/s" } +}; +#define BWSIZE (sizeof(bw)/sizeof(struct bandw)) + +void plugprth(FILE *f, int fmt, char *t) { + char *plot = ""; + char *jquery = ""; + char *tstyle = ""; + char *table = ""; + char *code = ""; + char s[128]; + time_t tm; + time(&tm); + sprintf(s, "TurboBench: %s - %s", t, asctime(localtime(&tm))); + + switch(fmt) { + case FMT_TEXT: + fprintf(f,"%s\n", s ); + break; + case FMT_VBULLETIN: + fprintf(f,"%s\n", s); + break; + case FMT_HTMLT: + fprintf(f,"TurboBench: %s - \n", s); + break; + case FMT_HTML: + fprintf(f,"TurboBench: %s - %s%s%s%s%s\n", s, plot, jquery, tstyle, table, code); + break; + case FMT_MARKDOWN: + fprintf(f,"#### %s (bold = pareto) MB=1.000.000\n", s); + break; + } +} + +void plugprtf(FILE *f, int fmt) { + switch(fmt) { + case FMT_HTML: + fprintf(f,"\n"); + break; + } +} + +void plugprtth(FILE *f, int fmt) { + char *head = " C Size ratio%% Bits/Int C MI/s D MI/s Name File (bold = pareto)"; + + switch(fmt) { + case FMT_TEXT: + fprintf(f," C Size ratio%% Bits/Int C MI/s D MI/s Name File\n"); + break; + case FMT_VBULLETIN: + fprintf(f,"[CODE][B]%s[/B] MB=1.000.0000\n", head); + break; + case FMT_HTMLT: + fprintf(f,"
%s MB=1.000.0000\n", head); 
+      break;
+    case FMT_HTML:     
+      fprintf(f,"

TurboBench: Compressor Benchmark

\n"); + break; + case FMT_MARKDOWN: + fprintf(f,"|C Size|ratio%%|Bits/Integer|C MI/s|D MI/s|Name|File|\n|--------:|-----:|--------:|--------:|----------------|----------------|\n"); + break; + case FMT_CSV: + fprintf(f,"size,csize,ratio,bpi,ctime,dtime,name,file\n"); + break; + case FMT_TSV: + fprintf(f,"size\tcsize\tratio\tbpi\tctime\tdtime\tname\tfile\n"); + break; + case FMT_SQUASH: + fprintf(f,"dataset,plugin,codec,level,compressed_size,compress_cpu,compress_wall,decompress_cpu,decompress_wall\n"); + break; + } +} + +void plugprttf(FILE *f, int fmt) { + switch(fmt) { + case FMT_VBULLETIN: + fprintf(f,"[/CODE]\n"); + break; + case FMT_HTMLT: + fprintf(f,"\n"); + break; + case FMT_HTML: + fprintf(f,"
C Sizeratio%%C MI/sD MI/sNameC MemD MemFile
\n"); + break; + case FMT_MARKDOWN: + fprintf(f,"\n\n"); + break; + } +} + +#define RATIO(_clen_, _len_) ((double)_clen_*100.0/_len_) +#define RATIOI(_clen_, _len_) ((double)_clen_*32.0/_len_) +#define FACTOR(_clen_, _len_) ((double)_len_/(double)_clen_) + +void plugprt(struct plug *plug, long long totinlen, char *finame, int fmt, double *ptc, double *ptd, FILE *f) { + double ratio = RATIO(plug->len,totinlen), ratioi = RATIOI(plug->len,totinlen), + //ratio = FACTOR(plug->len,totinlen), + tc = TMIS(totinlen,plug->tc), td = TMIS(totinlen,plug->td); + char name[65]; + if(plug->lev >= 0) + sprintf(name, "%s%s %d%s", plug->err?"?":"", plug->s, plug->lev, plug->prm); + else + sprintf(name, "%s%s%s", plug->err?"?":"", plug->s, plug->prm); + + int c = 0, d = 0, n = 0; + if(tc > *ptc) { c++; n++; *ptc = tc; } + if(td > *ptd) { d++; n++; *ptd = td; } + switch(fmt) { + case FMT_TEXT: + fprintf(f,"%12"PRId64" %5.1f %5.2f %8.2f %8.2f %-16s%s\n", + plug->len, ratio, ratioi, tc, td, name, finame); + break; + case FMT_VBULLETIN: + fprintf(f, "%12"PRId64" %5.1f %5.2f %s%8.2f%s %s%8.2f%s %s%-16s%s%s\n", + plug->len, ratio, ratioi, c?"[B]":"", tc, c?"[/B]":"", d?"[B]":"", td, d?"[/B]":"", n?"[B]":"", name, n?"[/B]":"", finame); + break; + case FMT_HTMLT: + fprintf(f, "%12"PRId64" %5.1f %5.2f %s%8.2f%s %s%8.2f%s %s%-16s%s%s\n", + plug->len, ratio, ratioi, c?"":"", tc, c?"":"", d?"":"", td, d?"":"", n?"":"", name, n?"":"", finame); + break; + case FMT_HTML: + fprintf(f, "%11"PRId64"%5.1f%5.2f%s%8.2f%s%s%8.2f%s%s%-16s%s%"PRId64"%"PRId64"%s\n", + plug->len, ratio, ratioi, c?"":"", tc, c?"":"", d?"":"", td, d?"":"", n?"":"", name, n?"":"", +// SIZE_ROUNDUP(plug->memc, Kb)/Kb, SIZE_ROUNDUP(plug->memd,Kb)/Kb, + plug->memc, plug->memd, + finame); + break; + case FMT_MARKDOWN: + fprintf(f, "|%"PRId64"|%5.1f|%5.2f|%s%.2f%s|%s%.2f%s|%s%s%s|%s|\n", + plug->len, ratio, ratioi, c?"**":"", tc, c?"**":"", d?"**":"", td, d?"**":"", n?"**":"", name, n?"**":"", finame); + break; + case FMT_CSV: + 
fprintf(f, "%12"PRId64",%11"PRId64",%5.1f,%5.2f,%8.2f,%8.2f,%-16s,%s\n", + totinlen, plug->len, ratio, ratioi, tc, td, name, finame); + break; + case FMT_TSV: + fprintf(f,"%12"PRId64"\t%11"PRId64"\t%5.1f\t5.2f\t%8.2f\t%8.2f\t%-16s\t%s\n", + totinlen, plug->len, ratio, ratioi, tc, td, name, finame); + break; + case FMT_SQUASH: + fprintf(f,"%12"PRId64",%11"PRId64",%5.1f,%8.2f,%8.2f,%-16s,%s\n", + finame, name, name, plug->len, tc, tc, td, td); + break; + } +} + +static int blknum, speedup; +enum { SP_SPEEDUPC=1, SP_SPEEDUPD, SP_TRANSFERC, SP_TRANSFERD }; + +void plugprtph(FILE *f, int fmt) { + int i; + + switch(fmt) { + case FMT_HTML: + fprintf(f,"

TurboBench: Speedup %s sheet

", (speedup&1)?"compression":"decompression"); + for(i = 0; i < BWSIZE; i++) + fprintf(f, "", bw[i].s); + fprintf(f, "\n"); + break; + case FMT_MARKDOWN: + fprintf(f,"#### TurboBench: Speedup %s sheet\n\n", (speedup&1)?"compression":"decompression"); + fprintf(f, "|Name"); + for(i = 0; i < BWSIZE; i++) + fprintf(f, "|%s", bw[i].s); + fprintf(f, "|File"); + if(blknum) + fprintf(f, " blknum=%d ", blknum); + fprintf(f, "|\n"); + fprintf(f, "|-------------"); + for(i = 0; i < BWSIZE; i++) + fprintf(f, "|---------:"); + fprintf(f, "|-------------|\n"); + break; + case FMT_VBULLETIN: + fprintf(f,"TurboBench: Speedup %s sheet\n\n", (speedup&1)?"compression":"decompression"); + fprintf(f,"[CODE][B]\n"); + default: + fprintf(f,"Name "); + for(i = 0; i < BWSIZE; i++) + fprintf(f, "%10s", bw[i].s); + if(blknum) + fprintf(f, " blknum=%d ", blknum); + fprintf(f, "\n"); + if(fmt == FMT_VBULLETIN) + fprintf(f,"[/B]\n"); + } +} + +static inline double spmbs(double td, long long len, int i, long long totinlen) { + double t = td + len*TM_T/(double)bw[i].bw + blknum*(bw[i].rtt*1000.0); + return TMIS(totinlen,t); +} + +//static inline double spdup(double td, long long len, int i, long long totinlen) { double t = td + len*TM_T/(double)bw[i].bw + blknum*(bw[i].rtt*1000.0); return ((double)totinlen*TM_T*100.0/t)/(double)bw[i].bw;} +static inline double spdup(double td, long long len, int i, long long totinlen) { + return (double)totinlen*100.0 / ((double)len + ((td+blknum*bw[i].rtt*1000.0)/TM_T)*(double)bw[i].bw ); +} + +void plugprtp(struct plug *plug, long long totinlen, char *finame, int fmt, int speedup, FILE *f) { + int i; + char name[65]; + if(plug->lev>=0) + sprintf(name, "%s%s%s%d%s", plug->err?"?":"", plug->s, fmt==FMT_MARKDOWN?"_":" ", plug->lev, plug->prm); + else + sprintf(name, "%s%s%s", plug->err?"?":"", plug->s, plug->prm); + if(fmt == FMT_HTML) + fprintf(f, "", name); + else + fprintf(f, "%-16s", name); + + for(i = 0; i < BWSIZE; i++) { + switch(fmt) { + case FMT_HTMLT: + 
case FMT_HTML: + fprintf(f, ""); + break; + case FMT_MARKDOWN: + break; + } + } + switch(fmt) { + case FMT_HTMLT: + case FMT_HTML: + fprintf(f, "\n", finame); + break; + case FMT_MARKDOWN: + fprintf(f, "|%s|\n", finame); + break; + default: + fprintf(f, "%s\n", finame); + break; + } +} + +struct { unsigned x,y; } divplot[] = { + { 1920, 1080}, // 16:9 + { 1600, 900}, + { 1280, 720}, + { 800, 600} +}; + +static unsigned divxy = 1, xlog = 1, xlog2, ylog, ylog2, plotmcpy; + +void plugplotb(FILE *f, int fmt, int idiv) { + fprintf(f, "
\n", + s, (speedup&1)?"Compression":"Decompression", xlog?"log":"", xlog?"type: 'log',\n":"", ylog?"type: 'log',\n":""); +} + +int libcmp(const struct plug *e1, const struct plug *e2) { + if(e1->len < e2->len) + return -1; + else if(e1->len > e2->len) + return 1; + else if(e1->td < e2->td) + return -1; + else if(e1->td > e2->td) + return 1; + return 0; +} + +int libcmpn(const struct plug *e1, const struct plug *e2) { + int c = strcmp(e1->s, e2->s); + if(c < 0) + return -1; + else if(c > 0) + return 1; + else if(e1->lev < e2->lev) + return -1; + else if(e1->lev > e2->lev) + return 1; + return 0; +} + +#define P_MCPY 1 // memcpy id +void plugplotc(struct plug *plug, int k, long long totinlen, int fmt, int speedup, char *s, FILE *f) { + int i, n = 0; + char name[65],txt[256]; + qsort(plug, k, sizeof(struct plug), (int(*)(const void*,const void*))libcmpn); + + struct plug *g,*gs=plug,*p; + for(txt[0] = name[0] = 0, g = plug; g < plug+k; g++) + if(g->id <= P_MCPY && !plotmcpy) + continue; + else { + if(strcmp(g->s, name)) { + if(name[0]) { + fprintf(f, "],\ny: ["); + for(p = gs; p < g; p++) + fprintf(f, "%.2f%s", speedup<3?FACTOR(p->len,totinlen):RATIO(p->len,totinlen), p+1s); + fprintf(f, "var %s = {\n x: [", g->s); + strcat(s, g->s); + } else { + fprintf(f, ","); + strcat(txt, ","); + } + if(g->lev >= 0) { + char ts[33]; + sprintf(ts, "'%s%s%d%s'", divxy>=2?"":g->s, divxy>=2?"":",", g->lev, g->prm); + strcat(txt, ts); + } + double t = (speedup&1)?g->tc:g->td; + fprintf(f, "%.2f", TMIS(totinlen,t)); + } + fprintf(f, "],\ny: ["); + for(p = gs; p < g; p++) + fprintf(f, "%.2f%s", speedup<3?FACTOR(p->len,totinlen):RATIO(p->len,totinlen), p+1\n", + s, (speedup&1)?"Compression":"Decompression", xlog2?"log":"", xlog2?"type: 'log',\n":"", ylog2?"type: 'log',\n":""); +} + +int plugprts(struct plug *plug, int k, char *finame, int xstdout, unsigned long long totlen, int fmt, char *t) { + double ptc = 0.0, ptd = 0.0; + struct plug *g; + if(!totlen) return 0; if(verbose>1) 
printf("'%s'\n", finame); + + qsort(plugt, k, sizeof(struct plug), (int(*)(const void*,const void*))libcmp); + char s[257]; + sprintf(s, "%s.%s", finame, fmtext[fmt]); + FILE *fo = xstdout>=0?stdout:fopen(s, "w"); + if(!fo) + die("file create error for '%s'\n", finame); + + plugprth( fo, fmt, t); + plugprtth(fo, fmt); + for(g = plugt; g < plugt+k; g++) + plugprt(g, totlen, finame, fmt, &ptc, &ptd, fo); + plugprttf(fo, fmt); + + if(speedup) { + switch(fmt) { + case FMT_TEXT : + fprintf(fo, "\n"); + break; + case FMT_HTML : + break; + case FMT_HTMLT: + fprintf(fo, "
\n");
+        break;
+      case FMT_MARKDOWN :
+        fprintf(fo, "\n"); 
+        break;
+    }
+    plugprtph(fo, fmt); 
+    for(g = plugt; g < plugt+k; g++) 
+      plugprtp(g, totlen, finame, fmt, speedup, fo);  
+    fprintf(fo, "\n"); 
+    switch(fmt) {
+      case FMT_TEXT : 
+        fprintf(fo, "\n"); break;
+      case FMT_HTML : 
+        fprintf(fo, "
Name%sFile"); + if(blknum) + fprintf(f, " blknum=%d ", blknum); + fprintf(f, "
%s"); + break; + case FMT_MARKDOWN: + fprintf(f, "|"); + break; + } + switch(speedup) { + case SP_TRANSFERD: + fprintf(f,"%9.3f ", spmbs(plug->td, plug->len, i, totinlen)); + break; + case SP_SPEEDUPD: + fprintf(f,"%9d ", (int)(spdup(plug->td, plug->len, i, totinlen)+0.5)); + break; + case SP_TRANSFERC: + fprintf(f,"%9.3f ", spmbs(plug->td, plug->len, i, totinlen)); + break; + case SP_SPEEDUPC: + fprintf(f,"%9d ", (int)(spdup(plug->td, plug->len, i, totinlen)+0.5)); + break; + } + switch(fmt) { + case FMT_HTMLT: + case FMT_HTML: + fprintf(f, "%s
\n"); break; + case FMT_HTMLT: + fprintf(fo, "
\n"); + break; + case FMT_VBULLETIN: + fprintf(fo,"[/CODE]\n"); + break; + case FMT_MARKDOWN : + fprintf(fo, "\n"); + break; + } + if(fmt == FMT_HTML) { + char s[1025]; + s[0] = 0; if(verbose>1) printf("generate speedup plot\n"); + plugplotb(fo, fmt, 1); + for(g = plugt; g < plugt+k; g++) + if(g->id > P_MCPY || plotmcpy) + plugplot(g, totlen, fmt, speedup, s, fo); + plugplote(fo, fmt, s); + + s[0] = 0; if(verbose>1) printf("generate speed/ratio plot\n"); + plugplotb(fo, fmt, 2); + plugplotc(plug, k, totlen, fmt, speedup, s, fo); + plugplotce(fo, fmt, s); + + } + } + plugprtf(fo, fmt); + fclose(fo); } +int plugread(struct plug *plug, char *finame, long long *totinlen) { + char s[256],name[33]; + struct plug *p=plug; + FILE *fi = fopen(finame, "r"); + if(!fi) return -1; + + fgets(s, 255, fi); + for(p = plug;;) { + p->tms[0] = 0; + int i = fscanf(fi, "%s\t%"PRId64"\t%"PRId64"\t%lf\t%lf\t%s\t%d\t%s\t%"PRId64"\t%"PRId64"\t%s\n", s, totinlen, &p->len, &p->td, &p->tc, name, &p->lev, p->prm, &p->memc, &p->memd, p->tms); + if(i != 11) + break; + if(p->prm[0]=='?') + p->prm[0]=0; + for(i = 0; plugs[i].id >=0; i++) + if(!strcmp(name, plugs[i].s)) { + p->s = plugs[i].s; + p->id = plugs[i].id; if(verbose>1) { fprintf(stdout, "%s\t%"PRId64"\t%"PRId64"\t%.6f\t%.6f\t%s\t%d%s\t%s\t%"PRId64"\t%"PRId64"\n", s, *totinlen, p->len, p->td, p->tc, p->s, p->lev, p->prm, p->tms, p->memc, p->memd); fflush(stdout); } + p++; + break; + } + } + fclose(fi); + return p - plug; +} + +//----------------------------------- Benchmark ----------------------------------------------------------------------------- +static int mcpy, mode, tincx, fuzz; + +int becomp(unsigned char *_in, unsigned _inlen, unsigned char *_out, unsigned outsize, unsigned bsize, int id, int lev, char *prm, int ifmt, CODCOMP codcomp) { + unsigned char *op,*oe = _out + outsize; if(!_inlen) return 0; + + if(ifmt >=0 && bsize == 4) { + unsigned *in = (unsigned *)_in,i; + for(i = 1; i < _inlen/4; i++) if(in[i] < in[i-1]+ifmt) 
die("IDs not sorted %d:%d,%d\n", i, in[i-1], in[i] ); + } + TMDEF; + TMBEG(0,tm_repc,tm_Repc); mempeakinit(); + unsigned char *in,*ip; + for(op = _out, in = _in; in < _in+_inlen; ) { + unsigned inlen,bs; + if(mode) { blknum++; + inlen = ctou32(in); in+=4; + vbput32(op, inlen); //ctou32(op) = inlen; op+=4;// + inlen*=4; + if(in+inlen>_in+_inlen) die("FATAL buffer overflow error"); //inlen = (_in+_inlen)-in; + } else inlen = _inlen; + + for(ip = in, in += inlen; ip < in; ) { + unsigned iplen = in - ip; iplen = min(iplen, bsize); + op = codcomp(ip, iplen, op, oe-op, id, lev, prm, ifmt); + ip += iplen; + if(op > _out+outsize) + die("Overflow error %llu, %u in lib=%d\n", outsize, (int)(ptrdiff_t)(op - _out), id); + } + } + TMEND(_inlen); + return op - _out; +} + +int bedecomp(unsigned char *_in, int _inlen, unsigned char *_out, unsigned _outlen, unsigned bsize, int id, int lev, char *prm, int ifmt, CODDECOMP coddecomp) { + unsigned char *ip; + TMDEF; + TMBEG(0,tm_repd,tm_Repd); mempeakinit(); + unsigned char *out,*op; + for(ip = _in, out = _out; out < _out+_outlen;) { + unsigned outlen,bs; + if(mode) { + vbget32(ip, outlen); //outlen = ctou32(ip); ip+=4; // + ctou32(out) = outlen; out += 4; + outlen *= 4; + if(out+outlen >_out+_outlen) die("FATAL: overflow error"); + } else outlen = _outlen; + for(op = out, out += outlen; op < out; ) { + unsigned oplen = out - op; + oplen = min(oplen, bsize); + ip = coddecomp(ip, 0, op, oplen, id, lev, prm, ifmt); + if(ip >_in+_inlen) die("FATAL inlen"); + op += oplen; + } + } + if(!(ip - _in)) return 0; + TMEND(_outlen); + return ip - _in; +} + + #ifdef LZTURBO +#include "../bebench.h" + #else +struct plug plugr[32]; int tid; +#define BEPRE +#define BEINI +#define BEPOST +#define BEOPT +#define BEUSAGE +#define BEFILE +#define BENCHSTA +#endif + +#define INOVD 4*1024 + + #if defined(_WIN32) && !defined(__MINGW__) +int getpagesize() { + static int pagesize = 0; + if (pagesize == 0) { + SYSTEM_INFO system_info; + 
GetSystemInfo(&system_info); + pagesize = max(system_info.dwPageSize, system_info.dwAllocationGranularity); + } + return pagesize; +} + #endif + +unsigned mininlen; +struct cod { CODCOMP comp; CODDECOMP decomp; }; +struct cod cods[] = { { codcomp, coddecomp }, {codcomps, coddecomps }}; + +unsigned long long filen; +size_t insizem; +char name[65]; +int ifmt=-1,mdelta=0; + +unsigned long long plugbench(struct plug *plug, unsigned char *_in, unsigned inlen, unsigned _insize, unsigned char *out, unsigned outsize, + unsigned char *_cpy, unsigned _bsize, struct plug *plugr, int tid, int krep, char *finame) { + unsigned char *in = _in; + //if(fuzz & 1) { in = (_in+insizem)-inlen; memmove(in, _in, inlen); } + double tc = 0.0, td = 0.0; + unsigned l = inlen, outlen, bsize = plug->blksize?plug->blksize:_bsize; + int insize=(ifmt >= 0)?bsize+4:bsize; + + BEPRE; + int nb = 1; + /*if(l < mininlen) { + bsize = l; + unsigned char *p; + for(p = in+l; ; p+=l) { + if(p+l > in+insize) break; + nb++; + memcpy(p, in, l); + } + }*/ + size_t peak = mempeakinit(); + outlen = becomp(in, l*nb, out, outsize, insize, plug->id, plug->lev, plug->prm, ifmt, cods[ifmt<0?0:1].comp)/nb; + plug->len += outlen; + plug->tc += (tc += (double)tm_tm/((double)tm_rm*nb)); + plug->memc = mempeak() - peak; + if(tm_Repc > 1) + TMSLEEP; + + if(verbose && inlen == filen) { printf("%12u %5.1f %5.2f %8.2f ", outlen, RATIO(outlen,inlen), RATIOI(outlen,inlen), TMIS(inlen,tc)); fflush(stdout); } + if(cmp) { + unsigned char *cpy = _cpy; + if(fuzz & 2) cpy = (_cpy+insizem) - l; + if(_cpy != _in) memrcpy(cpy, in, l); + peak = mempeakinit(); + unsigned cpylen = bedecomp(out, outlen, cpy, l*nb, insize, plug->id, plug->lev, plug->prm, ifmt, cods[ifmt<0?0:1].decomp)/nb; + td = (double)tm_tm/((double)tm_rm*nb); + plug->memd = mempeak() - peak; if(verbose && inlen == filen) { printf("%8.2f %-16s%s\n", TMIS(inlen,td), name, finame); } + int e = memcheck(in, l, cpy, fuzz?3:cmp); + plug->err = plug->err?plug->err:e; + BEPOST; + 
plug->td += td; + } else if(verbose && inlen == filen) { printf("%8.2f %-16s%s\n", 0.0, name, finame); } + return outlen; +} + //--------------------------------------- Zipfian generator -------------------------------------------------------- +double a = 1.5; + unsigned xbits[33]; +void stprint(char *s) { + int m; + unsigned long long t=0; + for(m = 0; m < 33; m++) + t += xbits[m]; + printf("\n%s bits histogram:",s); + for(m = 0; m < 33; m++) + if(xbits[m]) printf("%d:%.2f%% ", m, (double)xbits[m]*100/t); printf("\n"); +} + int dcmp(double *a, double *b) { if(*a < *b) return -1; if(*a > *b) return 1; @@ -527,633 +1195,481 @@ void zipfgen(unsigned *a, double alpha, unsigned x1, unsigned x2, int n) { } free(zmap); } -//---------------------------------------- bench -------------------------- -#define TM_MAX (1ull<<63) -#define TMPRINT(__x) { printf("%7.2f MB/s\t%s", (double)(tm_tm>=0.000001?(((double)n*tm_rm/MBS)/(((double)tm_tm/1)/TM_T)):0.0), __x); fflush(stdout); } -#define TMDEF unsigned tm_r,tm_R; tm_t _t0,_tc -#define TMBEG for(tm_tm = TM_MAX,tm_R = 0; tm_R < tm_Reps; tm_R++) { if(tm_R && tm_slp) sleep(tm_slp); for(_t0 = tminit(), tm_r=0; tm_r < tm_reps;) { -#define TMEND tm_r++; if((_tc = (tmtime() - _t0)) > tm_tx) break; } if(_tc < tm_tm) { tm_tm = _tc,tm_rm=tm_r; if(_tc>tm_Tx) break; } } -#define MBS 1000000.0 //MiBS 1048576.0 -static unsigned tm_reps = 1<<30, tm_Reps = 3, tm_rm, tm_slp; -static tm_t tm_tm, tm_tx=2*TM_T,tm_Tx=60*TM_T; -void memrcpy(unsigned char *__restrict out, unsigned char *__restrict in, size_t n) { int i; for(i = 0; i < n; i++) out[i] = ~in[i]; } -unsigned argtoi(char *s) { - char *p; unsigned n = strtol(s, &p, 10),f=1; - switch(*p) { - case 'k': f = 1000; break; - case 'm': f = 1000000; break; - case 'g': f = 1000000000; break; - case 'K': f = 1<<10; break; - case 'M': f = 1<<20; break; - case 'G': f = 1<<30; break; - } - return n*f; -} -#define ALGN - - #ifdef ALGN -static void *amalloc(size_t size, size_t align) { - #if 
defined(__MINGW32__) - return __mingw_aligned_malloc(size, align); - #elif defined(_WIN32) - return _aligned_malloc(__size, __align); - #elif _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 - void *p; - return posix_memalign(&p, align, size)?NULL:p; - #elif !defined(__APPLE__) && __STDC_VERSION__ >= 201112L - return aligned_alloc(16, size); - #else - return malloc(size); - #endif -} - -void afree(void *p) { - #if defined(__MINGW32__) - __mingw_aligned_free(p); - #elif defined(_WIN32) - _aligned_free(p); - #elif _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 - free(p); - #elif !defined(__APPLE__) && __STDC_VERSION__ >= 201112L - aligned_free(p); - #else - free(size); - #endif -} - #endif -//----------------------------------------------- Benchmark ------------------- -struct libss { int id; char *s;int size; }; - -struct libss libss[] = { - //------- PFor + PForDelta ---------- - { P_P4D, "TurboPFor", 128 }, - #if !defined(_WIN32) - { P_P4DR, "TurboPForDA", 128 }, // actually not working w. mingw - #endif - - #ifdef _OPTPFD - { P_OPTP4, "OptPFD", 128 }, //max. 28 bits - #endif - - //-------------- Bit Packing -------- - { P_PCKV, "TurboPackV", 128 }, - { P_PCK, "TurboPack", PACK_SIZE }, - { P_FOR, "TurboFor", PACK_SIZE }, - { P_PCKR, "TurboForDA", PACK_SIZE }, - - { P_SIMDV, "SIMDPackFPF", 128 }, - #ifdef _LIBFOR - { P_LIBFOR, "LibFor", PACK_SIZE }, - #endif - - //------ Variable byte ------------- - { P_VB, "TurboVbyte" }, - - { P_VBL, "VbyteFPF" }, - { P_VG8, "VarintG8IU" }, - #ifdef _MASKEDVBYTE - { P_MVB, "MaskedVByte" }, - #endif - #ifdef _VBYTEPOLY - { P_VBP, "VBytePoly" }, - #endif - - #ifdef _QMX - { P_VSQMX, "qmx" }, - #endif - // ----- Simple family ----- - { P_SV, "VSimple" }, -// { P_SVANS, "VSimpleANS", BLK_SIZE }, - #ifdef _SIMPLE_8B - { P_S64, "Simple-8b", SIMPLE8BMAX }, //crash on integers w. size 32 bits ! - #endif - #ifdef _SIMPLE16 - { P_S16, "Simple16" }, //max. 
28 bits - #endif - - //------- Elias Fano --------- - { P_EFANO, "EliasFano" }, - { P_CPY, "Copy" }, - - // ------ transpose/delta + lz77 ------- - #ifdef _LZT - { P_LZT10, "LzTurbo 10", BLK_SIZE }, -// { P_LZT20, "LzTurbo 20", BLK_SIZE }, -// { P_LZT22, "LzTurbo 22", BLK_SIZE }, - #endif - - #ifdef _LZ4 - { P_LZ4, "lz4", BLK_SIZE }, - #endif - - #ifdef _BLOSC - { P_BLZ, "blosc_lz", BLK_SIZE }, - { P_BLZ4, "blosc_lz4", BLK_SIZE }, - { P_BZLIB, "blosc_zlib", BLK_SIZE }, - #endif - - #ifdef _ZLIB - { P_ZLIB1, "zlib 1", BLK_SIZE }, -//{ P_ZLIB2, "zlib 2", BLK_SIZE }, -//{ P_ZLIB3, "zlib 3", BLK_SIZE }, -//{ P_ZLIB4, "zlib 4", BLK_SIZE }, - { P_ZLIB5, "zlib 5", BLK_SIZE }, -//{ P_ZLIB6, "zlib 6", BLK_SIZE }, -//{ P_ZLIB7, "zlib 7", BLK_SIZE }, -//{ P_ZLIB8, "zlib 8", BLK_SIZE }, - { P_ZLIB9, "zlib 9", BLK_SIZE }, - #endif - - //------- Transform ------------------- - #ifdef _TRANSFORM - { P_TRSP, "transpose", BLK_SIZE }, - { P_TRSPV, "transposev", BLK_SIZE }, - { P_ZZAG, "zigzag", BLK_SIZE }, - { P_DELTA, "delta", BLK_SIZE }, - #ifdef _BLOSC - { P_BSHUF, "shuffle", BLK_SIZE }, - #endif - #ifdef _BTSHUF - { P_BTSHUF, "bshuffle", BLK_SIZE }, - #endif - #endif - { -1, "" }, -}; - -#define MB 1000000 -int verb = 0, xcheck=2; unsigned xbits[33]; - -enum { T_TST, T_TXT, T_CHAR, T_BYTE, T_BYTE2, T_BYTE4, T_DBL }; - -struct libs { int id,err,size; char *s,*v; unsigned long long l, c[33]; double tc,td; }; -struct libs libs[64],slibs[64]; -void libini() { int m; for(m = 0; libs[m].id >= 0; m++) libs[m].l = libs[m].tc = libs[m].td = 0; } - -int l_cmp(struct libs *a, struct libs *b) { - if(a->l < b->l || (a->l == b->l && a->td < b->td) || (a->l == b->l && a->td == b->td && a->tc < b->tc)) return -1; - if(a->l > b->l || (a->l == b->l && a->td > b->td) || (a->l == b->l && a->td == b->td && a->tc > b->tc)) return 1; - return 0; -} - -void check(unsigned *in, unsigned n, unsigned *out, char *s) { - unsigned k,j; - for(k = 0; k < n; k++) - if(in[k] != out[k]) { - printf("\nFATAL in 
check %x,%x at %u[%u] in %s\n", in[k], out[k], k, n, s); - for(j=k & 0xffffff80u; j < k+128;j++) { unsigned e = in[j] != out[j]; - if(e) printf("*"); printf("%d:%x,%x ", j, in[j], out[j] ); +// 0 1 2 3 4 5 6 7, 8 +enum { T_TST, T_UINT8, T_UINT16, T_UINT24, T_UINT32, T_UINT40, T_UINT48, T_UINT56, T_UINT64, T_CHAR, T_TXT }; +int mdelta; +unsigned rm=0,rx=1<<20,n=0; +#define OVD (10*MB) +#define IPUSH(in,n,isize, nmax,u) { if(n >= nmax) { nmax = nmax?(nmax << 1):(1<<20); in = realloc(in, nmax*isize+OVD); if(!in) die("malloc err=%u", nmax); }\ + ctou32(in+n*isize) = u; n++; \ + } +unsigned befgen(unsigned char **_in, int fmt, unsigned isize, FILE *fi) { + unsigned char *in = *_in,*ip; unsigned nmax = 0; + if(!fi) { + if(!n) n = 25000000; printf("zipf alpha=%.2f range[%u..%u].n=%u\n ", a, rm, rx, n); + in = malloc(n*isize+OVD); if(!in) die("malloc err=%u", nmax); + zipfgen((unsigned *)in, a, rm, rx, n); + int i;for(i = 1; i <= n; i++) xbits[bsr32(ctou32(in+i*4))]++; + if(ifmt >= 0) { stprint("delta"); + unsigned *ip = (unsigned *)in; int v; + for(ip[0]=0,v = 1; v < n; v++) { + ip[v] += ip[v-1] + ifmt; if(ip[v]>=(1u<<31)) die("overflow generating sorted array\n" ); } - printf("\n"); - exit(0); - } -} - -void stprint(char *s) { - int m; - unsigned long long t=0; - for(m = 0; m < 33; m++) - t += xbits[m]; - printf("\n%s bits histogram:",s); - for(m = 0; m < 33; m++) - if(xbits[m]) printf("%d:%.2f%% ", m, (double)xbits[m]*100/t); printf("\n"); -} - -#define BI 1 // BI=4 -> MB/S=Megabyte/Sec BI=1 -> Millions integer/Sec -void print(struct libs *libs, unsigned long long n, char *s, unsigned long long *u) { - int m, k; - for(k = 0; libs[k].id >= 0; k++) {}; - qsort(libs, k, sizeof(libs[0]), (int(*)(const void*,const void*))l_cmp); - char *prtname = s?s:""; { char *p; if((p = strrchr(prtname, '/')) || (p = strrchr(prtname, '\\'))) prtname = p+1;} - for(m = 0; m < k; m++) - if(/*libs[m].tc ||*/ libs[m].l) { - struct libs *lb = &libs[m]; if(!lb->l) lb->tc=lb->td=0.0; - 
printf("%-16s%12llu\t%5.2f\t%5.2f\t%8.2f\t%8.2f\t%s\n", prtname, lb->l, (double)lb->l*100.0/((double)n*4.0), (double)lb->l*8.0/(double)n, - lb->tc>=0.000001?((double)n*BI/1000000.0) / (lb->tc/TM_T):0.0, - lb->td>=0.000001?((double)n*BI/1000000.0) / (lb->td/TM_T):0.0, - lb->s ); - } -} - -//----------------------------------------------------------------------------------------------- -unsigned bench(unsigned *__restrict _in, unsigned _inlen, int blksize, unsigned char *__restrict _out, unsigned long long outsize, char *inname, tm_t tx, unsigned *__restrict cpy, int bb, int mode ) { int m,id,b=bb,i; - if(!_inlen) return 0; if(verb>1) { printf(":%d,", _inlen); fflush(stdout); } - unsigned cn; - memcpy(_out, _in, (unsigned long long)_inlen*4); - for(m = 0; (id=libs[m].id) >= 0; m++) { - unsigned bsize = libs[m].size?libs[m].size:blksize, cl; - int insize=(mode>=0)?bsize+1:bsize; - struct libs *lb = &libs[m]; if(verb) printf("%s,%d", libs[m].s, insize); - #ifdef _BLOSC - if(lb->id == P_BLZ4) - blosc_set_compressor(BLOSC_LZ4_COMPNAME); - else if(lb->id == P_BZLIB) - blosc_set_compressor(BLOSC_ZLIB_COMPNAME); - else if(lb->id == P_BLZ) - blosc_set_compressor(BLOSC_BLOSCLZ_COMPNAME); - #endif - memrcpy((unsigned char *)_out, (unsigned char *)_in, _inlen*4); - if(cpy !=_in) memrcpy((unsigned char *)cpy, (unsigned char *)_in, _inlen*4); - - TMDEF; TMBEG - cn = cl = 0; - unsigned *in; - unsigned char *out; - for(out = _out, in = _in; in < _in+_inlen; ) { - unsigned n, inlen = *in++, *ip = in; in += inlen; cn += inlen; - *(unsigned *)out = inlen; out += 4; unsigned char *sout = out; - if(mode >= 0) - for(;ip < in; ip += n) { n = in - ip; n = min(n,insize); - out = besenc(ip, n, out, id, mode); - } - else - for(;ip < in; ip += n) { n = in - ip; n = min(n,insize); - out = beenc(ip, n, out, id, bb); - } - if(out > _out+outsize) die("Overflow error %llu, %u in %s\n", outsize, (int)(ptrdiff_t)(out - _out), lb->s); - cl += out - sout; - } - TMEND if(verb) { 
printf("/");fflush(stdout); } - lb->l += cl; lb->tc += (double)tm_tm/tm_rm; //printf("(%d %.2f,%2u) ", cl, (double)tm_tm, tm_rm); - - if(xcheck) { - TMDEF; TMBEG - unsigned *out; - unsigned char *in; - for(out = cpy, in = _out; out < cpy+_inlen;) { - unsigned n,*op, outlen = *(unsigned *)in; in += 4; - *out++ = outlen; - if(mode >= 0) - for(op=out,out += outlen; op < out; op += n) { n = out - op; n = min(n,insize); - in = besdec(in, n, op, id, mode); - } - else - for(op=out,out += outlen; op < out; op += n) { n = out - op; n = min(n,insize); - in = bedec(in, n, op, id, bb); - } - } - TMEND - lb->td += (double)tm_tm/tm_rm; //printf("(%d %.2f,%2u) ", cl, (double)tm_tm, tm_rm); printf("(%d,%d %.2f,%.2f) ", cl, lb->l, lb->tc, lb->td); - if(xcheck > 1 && lb->l) check(_in, _inlen, cpy, lb->s); - } + } else stprint(""); + *_in = in; + return n*isize; + } + { + unsigned n = 0; //printf("fmt=%d,", fmt);fflush(stdout); + #define LSIZE 1024 + char s[LSIZE+1]; + switch(fmt) { + case T_TXT: + while(fgets(s, LSIZE, fi)) { + s[strlen(s) - 1] = 0; + unsigned long long u = strtoull(s, NULL, 10) - mdelta; + IPUSH(in,n,isize,nmax,u); + } + break; + case T_CHAR: + for(;;) { + char *p = s; + int c; + while((c = getc(fi)) >= '0' && c <= '9') + if(p - s < LSIZE) *p++ = c; + *p = 0; + unsigned long long u = strtoull(s, NULL, 10) - mdelta; + IPUSH(in,n,isize,nmax,u); + if(c == EOF) break; + } + break; + case T_UINT8: { + unsigned char u; + while(fread(&u, 1, 1, fi)>0) + IPUSH(in,n,isize,nmax, u-mdelta); + } break; + case T_UINT16: { + unsigned short u; + while(fread(&u, sizeof(u), 1, fi)>0) + IPUSH(in,n,isize,nmax, u-mdelta); + } break; + /*case T_UINT32: { + unsigned u; + while(fread(&u, sizeof(u), 1, fi)>0) + IPUSH(in,n,isize,nmax, u-mdelta); + } break;*/ + default: die("unkown data format\n"); + } + *_in = in; + return n*isize; } - return cn; } -void usage() { - fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__); - fprintf(stderr, "Usage: icbench [options] 
[file]\n"); - fprintf(stderr, "Use zipfian generator when no file specified\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -bNs N = blocksize (default 128) ex. -b64k -b64K\n"); - fprintf(stderr, " -cN N = 0,1 delta for ordered integer lists)/9,10,11=1 integer per line/19,20,21:integers separated by non digit char\n"); - fprintf(stderr, " -eS S = encoder schemes sparated by / (default all)\n"); - fprintf(stderr, " -vN N = verbosity 1..3\n"); - fprintf(stderr, " -tN N = time in seconds per interation\n"); - fprintf(stderr, " -RN N = Iterations (default 3)\n"); - fprintf(stderr, "----- file specified --------------\n"); - fprintf(stderr, " -fN N = file format: 1=text:one integer per line 2=text:integers separated by non digit char, 3=bytes\n"); - fprintf(stderr, " -FNs N = max. file size to read\n"); - fprintf(stderr, "Ex. ./icbench -c1 gov2.sorted -eturbopack/turbopfor\n"); - fprintf(stderr, "----- file not specified --------------\n"); - fprintf(stderr, " -fN N = 0:bits range test. Ex. ./icbench -f0 -n0 -M32 -ebitpack\n"); +void usage(char *pgm) { + fprintf(stderr, "\nIcBench Copyright (c) 2013-2017 Powturbo %s\n", __DATE__); + fprintf(stderr, "Usage: %s [options] [file]\n", pgm); + fprintf(stderr, " -eS S = compressors/groups separated by '/' Parameter can be specified after ','\n"); + fprintf(stderr, " -b#s # = blocksize (default filesize,). max=1GB\n"); + fprintf(stderr, " -B#s # = max. benchmark filesize (default 1GB) ex. -B4G\n"); + fprintf(stderr, " -s#s # = min. buffer size to duplicate & test small files (ex. -s50)\n"); + fprintf(stderr, " s = modifier s:K,M,G=(1000, 1.000.000, 1.000.000.000) s:k,m,h=(1024,1Mb,1Gb). (default m) ex. 64k or 64K\n"); + fprintf(stderr, "Benchmark:\n"); + fprintf(stderr, " -i#/-j# # = Minimum de/compression iterations per run (default=auto)\n"); + fprintf(stderr, " -I#/-J# # = Number of de/compression runs (default=3)\n"); + fprintf(stderr, " -t# # = min. 
time in seconds per run.(default=2sec)\n"); + fprintf(stderr, " -S# Sleep # min. after 2 min. processing mimizing CPU trottling\n"); + fprintf(stderr, " -k# Repeat all benchmarks # times (default=3). -k0 = test mode\n"); + fprintf(stderr, " -K#t Max. time limit for all benchmarks (default 24h)\n"); + fprintf(stderr, " t = M:millisecond s:second m:minute h:hour. ex. 3h\n"); + fprintf(stderr, " -D No process real-time priority setting\n"); + fprintf(stderr, "Check:\n"); + fprintf(stderr, " -C# #=0 compress only, #=1 ignore errors, #=2 exit on error, #=3 crash on error\n"); + fprintf(stderr, " -fXs X = file format: 1=int8,2=int16,4=int32(=default), t=text:one integer per line c=text:integers separated by non digit char\n"); + fprintf(stderr, " s = sorted,S=sorted unique\n"); +// fprintf(stderr, " -z# check reading/writing outside bounds: #=1 compress, #=2 decompress, #3:both\n"); + fprintf(stderr, "Output:\n"); + fprintf(stderr, " -v# # = verbosity 0..3 (default 1)\n"); + fprintf(stderr, " -l# # = 1 : print all groups/plugins, # = 2 : print all codecs\n"); + fprintf(stderr, " -S# Plot transfer speed: #=1 Comp speedup #=2 Decomp speedup #=3 Comp 'MI/s' #=4 Decomp 'MI/s'\n"); + fprintf(stderr, " -p# #='print format' 1=text 2=html 3=htm 4=markdown 5:vBulletin 6:csv(comma) 7=tsv(tab)\n"); + fprintf(stderr, " -Q# # Plot window 0:1920x1080, 1:1600x900, 2:1280x720, 3:800x600 (default=1)\n"); + fprintf(stderr, " -g -g:no merge w/ old result 'file.tbb', -gg:process w/o output (use for fuzzing)\n"); + fprintf(stderr, " -o print on standard output\n"); + fprintf(stderr, " -G plot memcpy\n"); + fprintf(stderr, " -1 Plot Speedup linear x-axis (default log)\n"); + fprintf(stderr, " -3 Plot Ratio/Speed logarithmic x-axis (default linear)\n"); + fprintf(stderr, "Multiblock (0=number of integers followed by integer array):\n"); + fprintf(stderr, " -Moutput concatenate all input files to multiple blocks file output\n");\ + fprintf(stderr, " -r process multiple blocks per file (ex. 
gov2.sorted).\n"); + fprintf(stderr, "----- arg. ZIPF specified --------------\n"); fprintf(stderr, " -aF F = zipfian distribution alpha ex. -a1.0 uniform -a1.5 skewed\n"); fprintf(stderr, " -mNs N = minimum integer generated. s=(k,K, m,M, g,G\n"); fprintf(stderr, " -MNs N = maximum integer generated\n"); fprintf(stderr, " -nNs N = number of integers to generate\n"); - fprintf(stderr, "Ex. ./icbench -a1.5 -m0 -M255 -n100m\n"); - fprintf(stderr, " s = modifier s:k,m,g=(1000,1 million,1 billion) s:K,M,G=(1024,1MB,1GB) ex. 64k or 64K\n"); + fprintf(stderr, "Ex. ./icbench -a1.5 -m0 -M255 -n100m ZIPF\n"); + BEUSAGE; + fprintf(stderr, "ex. ./icbench file -eVBYTE/turbopfor ZIPF\n"); + fprintf(stderr, "ex. ./icbench -eTurboPFor -fS -r gov2.sorted\n"); exit(0); -} +} -#define TEST64 - #ifdef TEST64 -#define R64 ((unsigned long long)rand()) -#define RND64 ( (R64<<60) ^ (R64<<45) ^ (R64<<30) ^ (R64<<15) ^ (R64<<0) ) -#include - #define NN (4*1024*1024) - uint64_t in[NN],cpy[NN]; - unsigned char out[NN*9]; -void vstest64(int id, int rm,int rx, unsigned n) { fprintf(stderr,"bitpack.n=%d ", n); - unsigned b,i; - for(b = rm; b <= rx; b++) { fprintf(stderr,"\nb=%d:", b); - uint64_t start = 0; - for(i = 0; i < n; i++) - in[i] = (/*start +=*/ RND64 & (b==64?0xffffffffffffffffull:((1ull << b)-1)));//fprintf(stderr, ".%llx ", in[0]); - - unsigned char *op; - switch(id) { - case 0: op = vbenc64( in, n, out); break; - case 1: op = vsenc64( in, n, out); break; - case 2: op = bitpack64( in, n, out, b); break; - case 3: op = p4denc64( in, n, out); break; - case 4: op = efanoenc64(in, n, out, 0); break; - } - fprintf(stderr,"%d ", (int)(op-out) ); - memset(cpy, 0, sizeof(cpy)); - switch(id) { - case 0: vbdec64( out, n, cpy); break; - case 1: vsdec64( out, n, cpy); break; - case 2: bitunpack64(out, n, cpy, b); break; - case 3: p4ddec64( out, n, cpy); break; - case 4: efanodec64( out, n, cpy, 0); break; - } - for(i = 0; i < n; i++) - if(in[i] != cpy[i]) { - fprintf(stderr, "Error b=%d at 
'%d'", b, i); break; - } - } - exit(0); -} - #else -#define vstest64(id,rm,rx,n) +void printfile(char *finame, int xstdout, int fmt, char *rem) { + long long totinlen; + int k = plugread(plugt, finame, &totinlen); + char *p, s[256]; + if(k < 0) + die("file open error for '%s'\n", finame); + if(!k) return; + strncpy(s, finame, 255); + s[255]=0; + if((p = strrchr(s,'.')) && !strcmp(p, ".tbb")) + *p=0; + plugprts(plugt, k, s, xstdout, totinlen, fmt, rem); +} + +char *sifmt[] = {"","s","S","z"}; + #ifdef __MINGW32__ +extern int _CRT_glob = 1; #endif +int main(int argc, char* argv[]) { -#define OVD (10*MB) -int main(int argc, char *argv[]) { int r; - char fname[0x100], *cmd=NULL; - unsigned rm=0,rx=1<<29,n=0; - int mode = -1,fmt = -1; - long long rdmax = 1ull<<32; - double a = 1.5; - tm_t tx=1*1000000; - unsigned blksize = PACK_SIZE; - tminit(); - #ifdef _VARINTG8IU - VarIntG8IU(); - #endif - #ifdef _MASKEDVBYTE - simdvbyteinit(); - #endif - #ifdef _BLOSC - blosc_init(); - blosc_set_nthreads(1); - #endif - #ifdef _LZT - anscset(0); ansdset(0); - #endif - int c, digit_optind = 0, this_option_optind = optind ? optind : 1, option_index = 0; - static struct option long_options[] = { {"repeat", 0, 0, 'r'}, {0,0, 0, 0} }; + int xstdout=-1,xstdin=-1; + int recurse = 0, xplug = 0,tm_Repk=1,plot=-1,fmt=0,fno,merge=0,rprio=1; + unsigned bsize = 1u<<30, bsizex=0; + unsigned long long filenmax = 0; + char *scmd = NULL,*trans=NULL,*beb=NULL,*rem="",s[2049]; + char *_argvx[1], **argvx=_argvx; + int dfmt = 0; + + int c, digit_optind = 0; for(;;) { - if((c = getopt_long(argc, argv, "Bsha:b:c:e:f:F:H:m:n:r:R:S:t:T:X:v:M:", long_options, &option_index)) == -1) break; - switch(c) { - case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break; + int this_option_optind = optind ? 
optind : 1; + int option_index = 0; + static struct option long_options[] = { + { "help", 0, 0, 'h'}, + { 0, 0, 0, 0} + }; + if((c = getopt_long(argc, argv, "1234A:a:b:B:C:d:ce:E:F:f:gGi:I:j:J:k:K:l:L:m:M:n:N:oOPp:Q:rRs:S:t:T:Uv:V:W:X:Y:Z:z", long_options, &option_index)) == -1) break; + switch(c) { + case 0: + printf("Option %s", long_options[option_index].name); + if(optarg) printf (" with arg %s", optarg); printf ("\n"); + break; case 'a': a = strtod(optarg, NULL); break; - case 'b': blksize = argtoi(optarg); if(blksize>BLK_SIZE) blksize = BLK_SIZE; break; - case 'c': mode = atoi(optarg); break; - case 'f': fmt = atoi(optarg); break; - case 'F': rdmax = argtoi(optarg); break; - case 'h': usage(); break; - case 'H': xcheck = atoi(optarg); break; - case 'e': cmd = optarg; break; - case 'v': verb = atoi(optarg); break; - - case 'S': tm_slp = atoi(optarg); break; - case 't': tm_tx = atoi(optarg)*TM_T;break; - case 'T': tm_Tx = atoi(optarg)*TM_T;break; - case 'r': tm_reps = atoi(optarg); break; - case 'R': tm_Reps = atoi(optarg); break; - - case 'n': n = argtoi(optarg); break; - case 'm': rm = argtoi(optarg); break; - case 'M': rx = argtoi(optarg); break; - case 'X': vstest64(atoi(optarg),rm,rx,n); break; - default: usage(); - } - } - int fno,i=0; - if(!tm_reps) tm_reps=tm_Reps=1; //printf("range=(min=%u, max=%u)\n", rm, rx);fflush(stdout); - // build the test functions set - struct libss *ls; - if(cmd) { - char *q = NULL; - for(i=0,libs[0].id = -1;;) { - if(cmd) { - if(!*cmd) break; - q = strchr(cmd,','); - if(q) *q=' '; - if((q = strchr(cmd,'/')) != NULL) - *q = '\0'; - for(ls = libss; ls->id >= 0; ls++) - if(!strcasecmp(ls->s, cmd)) { - memset(&libs[i], 0, sizeof(struct libs)); - libs[i].id = ls->id; - libs[i].err = 0; - libs[i].s = ls->s; - libs[i++].size = ls->size; if(verb) printf("%s/", ls->s);fflush(stdout); - break; - } - if(ls->id < 0) die("library: '%s' not found\n", cmd); - cmd = q?(q+1):""; - } - } - } else for(ls = libss; ls->id >= 0; ls++) { - 
libs[i].id = ls->id; - libs[i].err = 0; - libs[i].s = ls->s; if(verb) printf("%s/", ls->s);fflush(stdout); - libs[i++].size = ls->size; - } - libs[i].id = -1; if(verb) printf("\n"); - unsigned long long totlen = 0; - memcpy(slibs,libs,sizeof(slibs)); - if(argc <= optind) { // No file specified at commandline - unsigned *in, *cpy,*ip; - unsigned char *out; - - if(!n) n = 100000000; - #ifdef ALGN - unsigned *_cpy,*_in; unsigned char *_out; - _in = amalloc(n*4+OVD,64); if(!_in) die("malloc err=%u", n); in =_in +0; - _out = amalloc(n*5+OVD,64); if(!_out) die("malloc err=%u", n); out=_out+12; - _cpy = amalloc(n*4+OVD,64); if(!_cpy) die("malloc err=%u", n); cpy=_cpy+3; - #else - in = malloc(n*4+OVD); if(!in) die("malloc err=%u", n); - out = malloc(n*5+OVD); if(!out) die("malloc err=%u", n); - cpy = malloc(n*4+OVD); if(!cpy) die("malloc err=%u", n); - #endif - char s[33]; - s[0] = 0; - if(fmt == T_TST) { // Unit test for fixed bit sizes - unsigned b; printf("bittest: %u-%u\n", rm, rx); fflush(stdout); - for(b = rm; b <= max(rx,32); b++) { - libini(); sprintf(s,"b=%d", b); - for(*in = n,i = 1; i <= n; i++) - in[i] = (1ull << b)-1; - totlen = bench(in, n+1, blksize, out, n*5+OVD, s, tx, cpy, b, - #ifdef _ZIGZAG - mode - #else - -1 - #endif - ); print(libs,totlen, s, NULL); - } - } else { // Benchmark w. generated data - printf("zipf alpha=%.2f range[%u..%u].n=%u\n ", a, rm, rx, n); - *in = n; - zipfgen(in+1, a, rm, rx, n); for(i = 1; i <= n; i++) xbits[bsr32(in[i])]++; - if(mode >= 0) { stprint("delta"); - unsigned *ip = in+1; int v; - for(ip[0]=0,v = 1; v < n; v++) { - ip[v] += ip[v-1] + mode; if(ip[v]>=(1u<<31)) die("overflow generating sorted array\n" ); - } - } else stprint(""); - totlen = bench(in, n+1, blksize, out, n*5+OVD, s, tx, cpy, -1, mode); - print(libs,totlen, s, NULL); - } - #ifdef ALGN - afree(_in); afree(_cpy); afree(_out); - #else - free(in); free(cpy); free(out); - #endif - } else for(fno = optind; fno < argc; fno++) { // Benchmark w. 
specified data files - libini(); - char *inname = argv[fno]; - if(fmt >= T_TXT) { //------------ convert text file to integer array format - FILE *fi = fopen(inname, "r"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } - unsigned *in = NULL, *cpy,*ip, nmax = 0; - unsigned char *out; + case 'b': bsize = argtol(optarg); bsizex++; break; + case 'B': filenmax = argtol(optarg); break; + case 'C': cmp = atoi(optarg); break; +// case 'c': ifmt = atoi(optarg); break; + case 'd': mdelta = atoi(optarg); break; + case 'e': scmd = optarg; break; + case 'f': { char *s = optarg; if(*s =='c') { dfmt = T_CHAR; s++;} else if(*s=='t') { dfmt = T_TXT; s++; } + else if(*s=='1' || *s=='2') { dfmt = s[0]-'0';s++;} + switch(*s) { case 's': ifmt = 0; break; case 'S': ifmt = 1;break; case 'z': ifmt = 2;break; } + } break; + case 'F': fac = strtod(optarg, NULL); break; +// case 'f': fuzz = atoi(optarg); break; + case 'g': merge++; break; + case 'G': plotmcpy++; break; - n = 1; - #define LSIZE 1024 - char s[LSIZE+1]; - switch(fmt) { - case T_TXT: { - while(fgets(s, LSIZE, fi)) { - s[strlen(s) - 1] = 0; - unsigned u = strtoul(s, NULL, 10); - if(n >= nmax) { - nmax = nmax?(nmax << 1):(1<<20); - in = realloc(in, nmax*4+OVD); if(!in) die("malloc err=%u", nmax); - } - in[n++] = u; - } - } break; - case T_CHAR: - for(;;) { - char *p = s; - int c; - while((c = getc(fi)) >= '0' && c <= '9') if(p - s < LSIZE) *p++ = c; - *p = 0; - unsigned u = strtoul(s, NULL, 10); //printf("%d,",u); - if(n >= nmax) { - nmax = nmax?(nmax << 1):(1<<20); - in = realloc(in, nmax*4+OVD); if(!in) die("malloc err=%u", nmax); - } - in[n++] = u; - if(c == EOF) break; - } - break; - case T_BYTE: { - unsigned char c; - while(fread(&c, 1, 1, fi)>0) { - if(n >= nmax) { - nmax = nmax?(nmax << 1):(1<<20); - in = realloc(in, nmax*4+OVD); if(!in) die("malloc err=%u", nmax); - } - in[n++] = c; - } - } break; - case T_BYTE2: { - unsigned short c; - while(fread(&c, sizeof(c), 1, fi)>0) { - if(n >= 
nmax) { - nmax = nmax?(nmax << 1):(1<<20); - in = realloc(in, nmax*4+OVD); if(!in) die("malloc err=%u", nmax); - } - in[n++] = c; - } - } break; - case T_BYTE4: { - unsigned c; - while(fread(&c, sizeof(c), 1, fi)>0) { //printf("%d,", c); - if(n >= nmax) { - nmax = nmax?(nmax << 1):(1<<20); - in = realloc(in, nmax*4+OVD); if(!in) die("malloc err=%u", nmax); - } - in[n++] = c; - } - } break; - case T_DBL: { //Test floating pint decomposition - double c,*din = NULL; n=0; - - while(fread(&c, 8, 1, fi)>0) { printf("%e\n", c); - if(!din || n >= nmax) { - nmax = nmax?(nmax << 1):(1<<20); - din = realloc(din, nmax*sizeof(c)+OVD); if(!din) die("malloc err=%u", nmax); - } - din[n++] = c; - } - double *dcpy = malloc(n*sizeof(dcpy[0])); - uint64_t *mantissa = malloc(n*sizeof(mantissa[0])); - unsigned *sign = malloc(n*sizeof(sign[0])); - unsigned *exp = malloc(n*sizeof(exp[0])); if(!mantissa || !exp || !sign || !dcpy) die("alloc error\n"); - bitdouble( din, n, exp, mantissa); - bitundouble( exp, mantissa, n, dcpy); - int i; for(i=0;i < n; i++) { printf("%d,%d,%llu,%e,%e\n", sign[i], exp[i],(long long unsigned int)mantissa[i], din[i], dcpy[i]); if(din[i]!=dcpy[i]) die("check error at %d %e %e\n", i, din[i], dcpy[i]); } - free(din); free(mantissa); free(exp); free(sign); free(dcpy); - exit(0); - } - default: die("unkown data format\n"); - } - fclose(fi); - out = malloc(n*5+OVD); if(!out) die("malloc err=%u", n); - cpy = malloc(n*4+OVD); if(!cpy) die("malloc err=%u", n); - in[0] = n-1; s[0] = 0; - unsigned long long l = bench(in, n, blksize, out, n*5+OVD, s, tx, cpy, -1, mode); - print(libs, l, inname, NULL); //printf("n=%d.%d\n", n-1,argc); - int i; for(i=0;libs[i].id>=0;i++) slibs[i].tc += libs[i].tc,slibs[i].td += libs[i].td,slibs[i].l += libs[i].l; - totlen += l; - continue; - } - //------- process integer array file ------------------ - FILE *fi = fopen64(inname, "rb"); - if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } - fseeko(fi, 0, 
SEEK_END); unsigned long long fisize = ftello(fi); fseeko(fi, 0, SEEK_SET); //printf("fisize=%llu\n", fisize); - if(fisize > rdmax) fisize = rdmax; - fisize /= 4; - - unsigned *in, *cpy,*ip,num; - unsigned char *out; - unsigned long long outsize=fisize*5+OVD,totlen=0,bitslen[33]={0}; - out = malloc(outsize); if(!out) die("malloc err=%llu", fisize); - cpy = malloc(fisize*4+OVD); if(!cpy) die("malloc err=%llu", fisize); - in = malloc(fisize*4+1024); if(!in) die("malloc err=%llu", fisize); - - ip = in; - while(fread(&num, 1, 4, fi) == 4 && num) { - if(num < rm || num > rx) { fseeko(fi, num*4, SEEK_CUR); continue; } - if(ip+num > in+fisize) { - totlen += bench(in, ip-in, blksize, out, outsize, inname, tx, cpy, -1, mode); - if(n && totlen > n) - break; - ip = in; - } - *ip++ = num; if(fread(ip, 4, num, fi) != num) break; - bitslen[bsr32(num)] += num*4; - #ifdef IC_STATS - unsigned *ep = ip+num,insize=(mode>=0)?blksize+1:blksize; - while(ip < ep) { - unsigned m = min(ep-ip, insize),i; - if(mode >= 0) { - for(i = 1; i < m; i++) { - if(verb>3) printf(":%u ", ip[i]);fflush(stdout); - xbits[bsr32((ip[i] - ip[i-1]) - mode)]++; - if(ip[i] < ip[i-1]+mode) die("IDs in '%s' not sorted.[did=%u,%u] at line=%d\n", inname, ip[i], ip[i-1], (int)(ip-in)); - } - } else for(i = 0; i < m; i++) xbits[bsr32(ip[i])]++; - ip += m; - } + case 'i': if((tm_repc = atoi(optarg))<=0) + tm_repc=tm_Repc=1; break; + case 'I': tm_Repc = atoi(optarg); break; + case 'j': if((tm_repd = atoi(optarg))<=0) + tm_repd=tm_Repd=1; break; + case 'J': tm_Repd = atoi(optarg); break; + case 'k': if((tm_Repk = atoi(optarg))<=0) tm_repc=tm_Repc=tm_repd=tm_Repd=tm_Repk=1; break; + case 'K': tm_RepkT = argtot(optarg); break; + case 'L': tm_slp = atoi(optarg); break; + case 't': tm_tx = atoi(optarg)*TM_T; break; + case 'T': tm_TX = atoi(optarg)*TM_T; break; + case 'S': speedup = atoi(optarg); break; + + case 'l': xplug = atoi(optarg); break; + case 'r': mode++; break; + case 'o': xstdout++; break; + case 'p': fmt = 
atoi(optarg); break; + case 'P': mcpy++; break; + case 'Q': divxy = atoi(optarg); + if(divxy>3) divxy=3; break; + case 'D': rprio=0; break; + case 's': mininlen = argtoi(optarg,Mb); break; + case 'v': verbose = atoi(optarg); break; + case 'Y': seg_ans = argtoi(optarg,Kb); break; + case 'Z': seg_huf = argtoi(optarg,Kb); break; + case '1': xlog = xlog?0:1; break; + case '2': ylog = ylog?0:1; break; + case '3': xlog2 = xlog2?0:1; break; + case '4': ylog2 = ylog2?0:1; break; + #ifdef LZTURBO + case 'c': beb = optarg; break; #else - ip += num; + case 'c': fprintf(stderr, "Option M: only in binary package available"); exit(0); #endif + case 'n': n = argtoi(optarg,1); break; + case 'm': rm = argtoi(optarg,1); break; + case 'M': rx = argtoi(optarg,1); break; +// case 'z': vstest64(atoi(optarg),rm,rx,n); break; + BEOPT; + case 'h': + default: + usage(argv[0]); + exit(0); } - fclose(fi); - totlen += bench(in, ip-in, blksize, out, outsize, inname, tx, cpy, -1, mode); // printf("N#%u,%u ", (int)(ip-in), (unsigned)(totlen/1000000)); - free(in); free(cpy); free(out); - #ifdef IC_STATS - stprint("delta"); + } + if(xplug) { + xplug==1?plugsprt():plugsprtv(stdout, fmt); + exit(0); + } + + if(argc <= optind) { + #ifdef _WIN32 + setmode( fileno(stdin), O_BINARY ); #endif - print(libs,totlen,inname, bitslen); - } - if(fmt >= T_TXT && (fno-optind)>1) { printf("\n"); print(slibs,totlen, "TOT", NULL); } + argvx[0] = "stdin"; + optind = 0; + argc = 1; + recurse = 0; + } else + argvx = argv; + + if(fmt) { + if(argc <= optind) { printf("no input file specified"); exit(0); } + for(fno = optind; fno < argc; fno++) + printfile(argvx[fno], xstdout, fmt, rem); + exit(0); + } + if((tm_repc|tm_Repc|tm_repd|tm_Repd) ==1) + tm_Repk = 1; + if(rprio) { + #ifdef _WIN32 + SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); + #else + setpriority(PRIO_PROCESS, 0, -19); + #endif + } + if(!scmd) scmd = "DEFAULT"; + for(s[0] = 0;;) { + char *q; int i; + if(!*scmd) break; + if(q = 
strchr(scmd,'/')) *q = '\0'; + for(i = 0; i < PLUGGSIZE; i++) + if(!strcmp(scmd, plugg[i].id)) { + strcat(s, "ON/"); + strcat(s, plugg[i].s); + strcat(s, "OFF/"); + break; + } + if(i >= PLUGGSIZE) { + strcat(s,scmd); + strcat(s,"/"); + } + scmd = q?(q+1):(char*)""; + } + unsigned k = plugreg(plug, s, 0, bsize, bsizex); + if(k > 1 && argc == 1 && !strcmp(argvx[0],"stdin")) { printf("multiple codecs not allowed when reading from stdin"); exit(0); } + + BEINI; + if(!filenmax) filenmax = Gb; + int krep; + char *finame = ""; + int pagesize = getpagesize(); + unsigned long long totinlen; + tm_t tmk0 = tminit(); + + struct plug *g; + for(g = plugt; g < plugt+k; g++) g->tc = g->td = DBL_MAX; + char sfiname[33]; + + /*if(dfmt == T_TST) { // Unit test for fixed bit sizes + unsigned b; printf("bittest: %u-%u\n", rm, rx); fflush(stdout); + for(b = rm; b <= max(rx,32); b++) { + codini(insize, p->id); + sprintf(s,"b=%d", b); + for(i = 0; i < n; i++) + in[i] = (1ull << b)-1; + outlen = plugbench(p, in, inlen, insize, out, outsize, _cpy, bsize, plugr,tid, krep, finame); + + totlen = bench(in, n+1, blksize, out, n*5+OVD, s, tx, cpy, b, + #ifdef _ZIGZAG + mode + #else + -1 + #endif + ) print(libs,totlen, s, NULL); + } + exit(0); + }*/ + for(krep = 0; krep < tm_Repk; krep++) { if(tm_Repk > 1) printf("Benchmark: %d from %d\n", krep+1, tm_Repk); + totinlen = 0; + for(g = plugt; g < plugt+k; g++) + g->len = g->tck = g->tdk = 0; // BEFILE; + + for(fno = optind; fno < argc; fno++) { + finame = argvx[fno]; if(verbose > 1) printf("%s\n", finame); + FILE *fi = NULL; + if(!strcmp(finame,"ZIPF")) { sprintf(sfiname, "ZIPF%.2f_%u-%u", a,rm,rx); + strcat(sfiname,sifmt[ifmt+1]); + finame=sfiname; if(!dfmt) dfmt=T_UINT32; + } else { fi = strcmp(finame,"stdin")?fopen(finame, "rb"):stdin; if(!fi) { perror(finame); die("open error '%s'\n", finame); } } + + char *q; + if((q = strrchr(finame, '\\')) || (q = strrchr(finame, '/'))) finame = q+1; if(verbose>1) printf("'%s'\n", finame); + + size_t 
outsize,insize; + unsigned char *_in = NULL; + int inlen; + if(dfmt) { + if(!_in) { filen = inlen = befgen(&_in, dfmt, 4, fi); + insize = min(filen,(1u< filenmax) filen = filenmax; + } else + filen = filenmax; + insize = filen; if(insize > filenmax) insize = filenmax; + insize = min(filen,(1u<lev >= 0) + sprintf(name, "%s %d%s", p->s, p->lev, p->prm); + else + sprintf(name, "%s%s", p->s, p->prm); + + codini(insize, p->id); + long long outlen=0; + double ptc = DBL_MAX, ptd = DBL_MAX; + bsize = p->blksize; + p->len = p->tc = p->td = 0; blknum = 0; + + if(dfmt) { + ftotinlen = inlen; memrcpy(out, _in, inlen); + outlen = plugbench(p, _in, inlen, insize, out, outsize, _cpy, bsize, plugr,tid, krep, finame); + } else { + ftotinlen = 0; + fseek(fi, 0, SEEK_SET); + while((inlen = fread(_in, 1, insize, fi)) > 0) { memrcpy(out, _in, inlen); + if(mode) { + unsigned char *ip,*e = _in+inlen; + for(ip = _in;;) { + unsigned l = ctou32(ip); //printf("%d:", l); + if(ip+4+l*4 > e) { + fseek(fi, -(e - ip), SEEK_CUR); //printf("%d,", -(e-ip)); + break; + } + ip += 4+l*4; + } + inlen = ip - _in; + } + ftotinlen += inlen; + outlen += plugbench(p, _in, inlen, insize, out, outsize, _cpy, bsize, plugr,tid, krep, finame); + if(ftotinlen >= filen) break; + } + } + g->len += outlen; + g->tck += p->tc; + g->tdk += p->td; + g->err = g->err?g->err:p->err; + if(p->memc > g->memc) g->memc = p->memc; + if(p->memd > g->memd) g->memd = p->memd; + g->s = p->s; + g->lev = p->lev; + strcpy(g->prm, p->prm); + g->id = p->id; + codexit(p->id); if(verbose && filen > insize) plugprt(p, totinlen, finame, FMT_TEXT, &ptc, &ptd,stdout); + } + totinlen += ftotinlen; + if(fi) fclose(fi); + dfmt?free(_in):_vfree(_in, insizem); _in = NULL; + _vfree(out, outsize); + if(_cpy && _cpy != _in) + _vfree(_cpy, insizem); + } + for(g = plugt; g < plugt+k; g++) { + if(g->tck < g->tc) g->tc = g->tck; + if(g->tdk < g->td) g->td = g->tdk; //g->totinlen += plugfile(fi, p, finame, filenmax, bsize, plugr, tid, krep); + } + 
//if(tmtime() - tmk0 > tm_RepkT) break; + } + BENCHSTA; + + if(argc - optind > 1) { + unsigned clen = strpref(&argvx[optind], argc-optind, '\\', '/'); + strncpy(s, argvx[optind], clen); + if(clen && (s[clen-1] == '/' || s[clen-1] == '\\')) + clen--; + s[clen] = 0; + finame = strrchr(s,'/'); + if(!finame) + finame = strrchr(s, '\\'); + if(!finame) + finame = s; + else finame++; + } else { + char *p; + if((p = strrchr(finame, '\\')) || (p = strrchr(finame, '/'))) + finame = p+1; + } + if(!strcmp(finame,"ZIPF")) finame = sfiname; + sprintf(s, "%s.tbb", finame); + if(merge /*|| tm_repc <= 1 || tm_repd <= 1*/) { + if(merge == 1) + plugprts(plugt, k, s, 1, totinlen, FMT_TEXT, rem); + exit(0); + } + + long long _totinlen; + int gk = plugread(plug, s, &_totinlen); + if(_totinlen != totinlen) + gk = 0; + FILE *fo = fopen(s, "w"); + if(fo) { + char tms[20]; + time_t tm; + time(&tm); + struct tm *ltm = localtime(&tm); + sprintf(tms, "%.4d-%.2d-%.2d.%.2d:%.2d:%.2d", 1900 + ltm->tm_year, ltm->tm_mon+1, ltm->tm_mday, ltm->tm_hour, ltm->tm_min, ltm->tm_sec); + + struct plug *g,*p; + fprintf(fo, "dataset\tsize\tcsize\tdtime\tctime\tcodec\tlevel\tparam\tcmem\tdmem\ttime\n"); + for(p = plugt; p < plugt+k; p++) { + for(g = plug; g < plug+gk; g++) + if(g->id >= 0 && !strcmp(g->s, p->s) && g->lev == p->lev && !strcmp(g->prm, p->prm)) { + if(g->len == p->len) { + int u=0; + if(g->td < p->td || p->td < 0.01) + p->td = g->td,u++; + if(g->tc < p->tc || p->tc < 0.01) + p->tc = g->tc,u++; + + if(g->memd != p->memd) u++; + if(g->memc != p->memc) u++; + strcpy(p->tms, u?tms:g->tms); //printf("(%lld %lld) ", g->memc, g->memd);printf("(%lld %lld) ", p->memc, p->memd); + } + g->id = -1; + break; + } + fprintf(fo, "%s\t%"PRId64"\t%"PRId64"\t%.6f\t%.6f\t%s\t%d\t%s\t%"PRId64"\t%"PRId64"\t%s\n", finame, totinlen, p->len, p->td, p->tc, p->s, p->lev, p->prm[0]?p->prm:"?", p->memc, p->memd, p->tms[0]?p->tms:tms); + } + for(g = plug; g < plug+gk; g++) + if(g->id >= 0) + fprintf(fo, 
"%s\t%"PRId64"\t%"PRId64"\t%.6f\t%.6f\t%s\t%d\t%s\t%"PRId64"\t%"PRId64"\t%s\n", finame, totinlen, g->len, g->td, g->tc, g->s, g->lev, g->prm[0]?g->prm:"?", g->memc, g->memd, g->tms[0]?g->tms:tms); + fclose(fo); + printfile(s, 0, FMT_TEXT, rem); + } } + diff --git a/idx.h b/idx.h index 12dd921..6f1e287 100644 --- a/idx.h +++ b/idx.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -47,7 +47,6 @@ //-------------------------- Mapping term id <-> posting offset in file ---------------------------------- typedef struct { uint8_t offseth; uint32_t offsetl; } __attribute__ ((packed)) tmap_t; // 40 bits offsets -> 1 Terabyte -#define TIDMAPSET(__t, __ofs) { (__t)->offseth = (__ofs)>>32; (__t)->offsetl = (__ofs) & 0xffffffff; } -#define TIDMAPGET(__t) ((__off64_t)(__t)->offseth << 32 | (__t)->offsetl) -#define TIDMAP(__fdm, __tid) ({ unsigned char *_bp = __fdm; tmap_t *_t = (tmap_t *)&_bp[(__tid)*sizeof(tmap_t)]; TIDMAPGET(_t); }) - +#define TIDMAPSET(_t_, _ofs_) { (_t_)->offseth = (_ofs_)>>32; (_t_)->offsetl = (_ofs_) & 0xffffffff; } +#define TIDMAPGET(_t_) ((__off64_t)(_t_)->offseth << 32 | (_t_)->offsetl) +#define TIDMAP(_fdm_, _tid_) ({ unsigned char *_bp = _fdm_; tmap_t *_t = (tmap_t *)&_bp[(_tid_)*sizeof(tmap_t)]; TIDMAPGET(_t); }) diff --git a/idxcr.c b/idxcr.c index 95890c1..f9846ad 100644 --- a/idxcr.c +++ b/idxcr.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -35,13 +35,9 @@ #include "conf.h" #include "vint.h" -#include "vp4dc.h" +#include "vp4c.h" #include "bitpack.h" #include "idx.h" - - #if defined(__APPLE__) -#define fopen64(a,b) fopen(a,b) - #endif //--------------------------------------------------------------------------------------------------------------- #define DELTA( __in, __n, __b) do { unsigned 
_v; for(__b=0,_v = __n-1; _v > 0; --_v) __in[_v] = (__in[_v] - __in[_v-1]) - 1, __b |= __in[_v]; __b = bsr32(__b); } while(0) @@ -49,7 +45,7 @@ int verb; void usage() { - fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__); + fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2017 Powturbo %s\n", __DATE__); fprintf(stderr, "https://github.com/powturbo/TurboPFor\n\n"); fprintf(stderr, "Create inverted index from 'Document identifier data set' format\n"); fprintf(stderr, "See http://lemire.me/data/integercompression2014.html'\n"); @@ -116,7 +112,7 @@ int main(int argc, char *argv[]) { if(n > 1) { DELTA(ip, n, b); //bitdelta32( in+1, --n, pa, in[0], mode); #ifdef _TURBOPFOR - b = p4d32(ip+1, n-1, &bx); + b = _p4dec32(ip+1, n-1, &bx); #endif } #ifdef SKIP_S @@ -139,9 +135,9 @@ int main(int argc, char *argv[]) { #endif #ifdef _TURBOPFOR *op++ = bx; - op = n==129?p4dev32( ip+1, n-1, op, b, bx):p4de32( ip+1, n-1, op, b, bx); + op = n==129?_p4dec128v32( ip+1, n-1, op, b, bx):_p4dec32( ip+1, n-1, op, b, bx); #else - op = n==129?bitpackv32( ip+1, n-1, op, b) :bitpack32(ip+1, n-1, op, b); + op = n==129?bitpack128v32( ip+1, op, b) :bitpack32(ip+1, n-1, op, b); #endif } ip += n; diff --git a/idxqry.c b/idxqry.c index f2909a2..dbce724 100644 --- a/idxqry.c +++ b/idxqry.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -45,7 +45,7 @@ #include "conf.h" #include "vint.h" #include "bitunpack.h" -#include "vp4dd.h" +#include "vp4d.h" #include "idx.h" //#define STATS @@ -126,7 +126,7 @@ typedef struct { int postinit( post_t *v, int tid, idxrd_t *idx, unsigned *dids) { unsigned long long o = TIDMAP(idx->fdm, tid); if(!o) return 0; unsigned char *p = idx->fdp + o; // start of posting; - v->f_t = vbget32(p); // num docs + vbget32(p, v->f_t); // num docs v->didno = v->bno = -1; v->bnum = (v->f_t+BLK_DIDNUM-1)/BLK_DIDNUM; // num blocks 
v->_f_t = v->f_t; @@ -149,7 +149,7 @@ static ALWAYS_INLINE unsigned postdec(post_t *v, int bno, unsigned *dids) { if(v p = v->p + pix[v->bnum]; // o=offset to posting block dids[0] = *pix; // first did in block v->didnum = bno < v->bnum-1?BLK_DIDNUM:v->f_t - bno*BLK_DIDNUM; - } else { v->didnum = v->f_t; dids[0] = vbget32(p); } STAT(st_dec += v->didnum); STAT(st_decs[st_terms] += v->didnum); + } else { v->didnum = v->f_t; vbget32(p, dids[0]); } STAT(st_dec += v->didnum); STAT(st_decs[st_terms] += v->didnum); #ifdef SKIP_S unsigned b = dids[0] & SKIP_M; dids[0] >>= SKIP_S; #endif @@ -160,9 +160,9 @@ static ALWAYS_INLINE unsigned postdec(post_t *v, int bno, unsigned *dids) { if(v #endif #ifdef _TURBOPFOR unsigned bx = *p++; - p = v->didnum == 129?p4dd1dv32( p, v->didnum-1, &dids[1], dids[0], b, bx):p4dd1d32( p, v->didnum-1, &dids[1], dids[0], b, bx); + p = v->didnum == 129?p4dd1d128v32( p, v->didnum-1, &dids[1], dids[0], b, bx):p4dd1d32( p, v->didnum-1, &dids[1], dids[0], b, bx); #else - p = v->didnum == 129?bitd1unpackv32( p, v->didnum-1, &dids[1], dids[0], b ):bitd1unpack32(p, v->didnum-1, &dids[1], dids[0], b); + p = v->didnum == 129?bitd1unpack128v32( p, &dids[1], dids[0], b ):bitd1unpack32(p, v->didnum-1, &dids[1], dids[0], b); #endif } v->didno = bno; @@ -196,7 +196,7 @@ static ALWAYS_INLINE unsigned postnext(post_t *v, unsigned *dids) { unsigned *pix = (unsigned *)p + v->bno; p = v->p + pix[v->bnum]; // o=offset to posting block dids[0] = *pix; // first did in block - } else dids[0] = vbget32(p); + } else vbget32(p, dids[0]); #ifdef SKIP_S unsigned b = dids[0] & SKIP_M; dids[0] >>= SKIP_S; #endif @@ -209,9 +209,9 @@ static ALWAYS_INLINE unsigned postnext(post_t *v, unsigned *dids) { #endif #ifdef _TURBOPFOR unsigned bx = *p++; - p = v->didnum == 129?p4dd1dv32( p, v->didnum-1, &dids[1], dids[0], b, bx):p4dd1d32( p, v->didnum-1, &dids[1], dids[0], b, bx); + p = v->didnum == 129?p4dd1d128v32( p, v->didnum-1, &dids[1], dids[0], b, bx):p4dd1d32( p, v->didnum-1, 
&dids[1], dids[0], b, bx); #else - p = v->didnum == 129?bitd1unpackv32( p, v->didnum-1, &dids[1], dids[0], b ):bitd1unpack32(p, v->didnum-1, &dids[1], dids[0], b); + p = v->didnum == 129?bitd1unpack128v32( p, &dids[1], dids[0], b ):bitd1unpack32(p, v->didnum-1, &dids[1], dids[0], b); #endif } dids[v->didnum] = INT_MAX; @@ -247,7 +247,7 @@ static ALWAYS_INLINE unsigned postget(post_t *v, unsigned did, unsigned *dids) { #endif } else { p = v->bp; - v->did = vbget32(p); + vbget32(p, v->did); v->ldid = UINT_MAX; } #ifdef SKIP_S @@ -261,9 +261,9 @@ static ALWAYS_INLINE unsigned postget(post_t *v, unsigned did, unsigned *dids) { #endif #ifdef _TURBOPFOR unsigned bx = *p++; - p = v->didnum == 129?p4dd1dv32( p, v->didnum-1, &dids[1], dids[0], b, bx):p4dd1d32( p, v->didnum-1, &dids[1], dids[0], b, bx); + p = v->didnum == 129?p4dd1d128v32( p, v->didnum-1, &dids[1], dids[0], b, bx):p4dd1d32( p, v->didnum-1, &dids[1], dids[0], b, bx); #else - p = v->didnum == 129?bitd1unpackv32( p, v->didnum-1, &dids[1], dids[0], b ):bitd1unpack32(p, v->didnum-1, &dids[1], dids[0], b); + p = v->didnum == 129?bitd1unpack128v32( p, &dids[1], dids[0], b ):bitd1unpack32(p, v->didnum-1, &dids[1], dids[0], b); #endif } dids[v->didnum] = v->ldid&INT_MAX; v->didno = 0; goto a; @@ -377,7 +377,7 @@ unsigned qrysearch(qry_t *q, idxrd_t *idx) { #ifdef SKIP_INTERVALS unsigned *_xd = dids[0], xdnum; unsigned *_yd = dids[1], ydnum; - if(v[0].f_t > BLK_DIDNUM) { + if(v[0].f_t > BLK_DIDNUM) { unsigned *_x = (unsigned *)v[0].bp, *x_ = _x+v[0].bnum, *x = _x, *xd; unsigned *_y = (unsigned *)v[1].bp, *y_ = _y+v[1].bnum, *y = _y, *yd; _xd[0] = _yd[0] = UINT_MAX; @@ -536,7 +536,7 @@ int qrybatch(idxrd_t *idx, char *fqname } void usage() { - fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__); + fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2017 Powturbo %s\n", __DATE__); fprintf(stderr, "https://github.com/powturbo/TurboPFor\n\n"); #ifdef THREAD_MAX fprintf(stderr, "Benchmark: parallel 
intersections in compressed inverted index\n\n"); diff --git a/idxseg.c b/idxseg.c index f83d350..b2357dc 100644 --- a/idxseg.c +++ b/idxseg.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// idxseg.c - Inverted Index - Create partitions from DocId file for prallel query evaluation +// idxseg.c - Inverted Index - Create partitions from DocId file for prallel query evaluation #define _LARGEFILE64_SOURCE 1 #define _FILE_OFFSET_BITS 64 #include @@ -45,7 +45,7 @@ unsigned argtoi(char *s) { } void usage() { - fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__); + fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2017 Powturbo %s\n", __DATE__); fprintf(stderr, "Partitioning\n"); fprintf(stderr, "Usage: idxseg -nNs -sPs \n"); fprintf(stderr, "Ns=total number of documents. 
Ps=number of partitions\n"); diff --git a/jic.c b/jic.c index 128a7b8..a788bb2 100644 --- a/jic.c +++ b/jic.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -119,3 +119,4 @@ JNIEXPORT jint JNICALL Java_jic_bitd32( JNIEnv *env, jclass cls, jintArr JNIEXPORT jint JNICALL Java_jic_bitd132( JNIEnv *env, jclass cls, jintArray _in, jint n, jint start ) { JNIBITS(bitd132, _in, n, start ); } JNIEXPORT jint JNICALL JavaCritical_jic_bitd132( jint i, jint *in, jint n, jint start ) { return bitd132( (unsigned *)in, n, start ); } + diff --git a/makefile b/makefile index 9f3cd9e..59983c2 100644 --- a/makefile +++ b/makefile @@ -1,93 +1,206 @@ -# powturbo (c) Copyright 2013-2015 -# Linux: "export CC=clang" windows mingw: "set CC=gcc" or uncomment one of following lines -# CC=clang -# CC=gcc +# powturbo (c) Copyright 2013-2016 +# ----------- Downloading + Compiling ---------------------- +# git clone --recursive git://github.com/powturbo/TurboPFor.git +# make +# +# Minimum make: "make NCODEC2=1 NCODEC3=1 NTRANFORM=1" to compile only TurboPFor +# Linux: "export CC=clang" "export CXX=clang". 
windows mingw: "set CC=gcc" "set CXX=g++" or uncomment the CC,CXX lines +CC ?= gcc +CXX ?= g++ +#CC=clang +#CXX=clang++ + +DDEBUG=-DNDEBUG -s MARCH=-march=native -#MARCH=-msse2 -CFLAGS=-DNDEBUG -fstrict-aliasing -m64 $(MARCH) -Iext - -UNAME := $(shell uname) -ifeq ($(UNAME), Linux) -LIBTHREAD=-lpthread -LIBRT=-lrt +#MARCH=-march=broadwell +ifeq ($(AVX2),1) +MARCH+=-mavx2 -mbmi2 else -CC=gcc +#NAVX2=0 +AVX2=0 +DEFS+=-DNAVX2 endif -BIT=./ +#---------------------------------------------- +ifeq ($(OS),Windows_NT) + UNAME := Windows +CC=gcc +CXX=g++ +CFLAGS+=-D__int64_t=int64_t +else + UNAME := $(shell uname -s) +ifeq ($(UNAME),$(filter $(UNAME),Linux Darwin FreeBSD GNU/kFreeBSD)) +LDFLAGS+=-lpthread -lrt +endif +endif + +ifeq ($(STATIC),1) +LDFLAGS+=-static +NMEMSIZE=1 +endif + +LBITS := $(shell getconf LONG_BIT) +ifeq ($(LBITS),64) +ARCH=64 +else +ifeq ($(ARCH),32) +CFLAGS=-fomit-frame-pointer +else +ARCH=64 +endif +endif + +#---------------------- make args -------------------------- +ifeq ($(NCODEC1),1) +DEFS+=-DNCODEC1 +else +NCODEC1=0 +endif + +ifeq ($(NCODEC2),1) +DEFS+=-DNCODEC2 +else +NCODEC2=0 +endif + +ifeq ($(NTRANFORM),1) +DEFS+=-DNTRANSFORM +else +NTRANSFORM=0 +endif + +ifeq ($(LZTURBO),1) +DEFS+=-DLZTURBO +endif + +#------------- +# disable peak memory calculation +ifeq ($(NMEMSIZE),1) +DEFS+=-DNMEMSIZE +else +ifeq ($(UNAME),$(filter $(UNAME),Linux Darwin FreeBSD GNU/kFreeBSD)) +LDFLAGS += -ldl +endif +endif + +CFLAGS+=-w -Wall -DNDEBUG -DUSE_THREADS -fstrict-aliasing -Iext -Iext/lz4/lib -Iext/simdcomp/include -Iext/MaskedVByte/include -Iext/LittleIntPacker/include -Iext/streamvbyte/include $(DEFS) +CXXFLAGS+=$(DDEBUG) $(MARCH) -std=gnu++11 -w -fpermissive -Wall -fno-rtti $(DEFS) -Iext/FastPFor/headers + all: icbench idxcr idxqry idxseg -bitpack.o: $(BIT)bitpack.c $(BIT)bitpack.h $(BIT)bitpack64_.h - $(CC) -O2 $(CFLAGS) -c $(BIT)bitpack.c +cpp: vp4c.c + $(CC) $(MARCH) -E vp4c.c -bitpackv.o: $(BIT)bitpackv.c $(BIT)bitpack.h $(BIT)bitpackv32_.h - $(CC) -O2 
$(CFLAGS) -c $(BIT)bitpackv.c - -vp4dc.o: $(BIT)vp4dc.c - $(CC) -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dc.c -vp4dd.o: $(BIT)vp4dd.c - $(CC) -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dd.c +bitpack.o: bitpack.c bitpack.h bitpack64_.h + $(CC) -O2 $(CFLAGS) $(MARCH) -c bitpack.c -varintg8iu.o: $(BIT)ext/varintg8iu.c $(BIT)ext/varintg8iu.h - $(CC) -O2 $(CFLAGS) -c -funroll-loops -std=c99 $(BIT)ext/varintg8iu.c +varintg8iu.o: ext/varintg8iu.c ext/varintg8iu.h + $(CC) -O2 $(CFLAGS) $(MARCH) -c -funroll-loops -std=c99 ext/varintg8iu.c -idxqryp.o: $(BIT)idxqry.c - $(CC) -O3 $(CFLAGS) -c $(BIT)idxqry.c -o idxqryp.o +idxqryp.o: idxqry.c + $(CC) -O3 $(CFLAGS) -c idxqry.c -o idxqryp.o -SIMDCOMPD=ext/simdcomp/ -SIMDCOMP=$(SIMDCOMPD)bitpacka.o $(SIMDCOMPD)src/simdintegratedbitpacking.o $(SIMDCOMPD)src/simdcomputil.o $(SIMDCOMPD)src/simdbitpacking.o +vsimple.o: vsimple.c + $(CC) -O2 $(CFLAGS) $(MARCH) -c vsimple.c -#LIBFOR=ext/for/for.o -MVB=ext/MaskedVByte/src/varintencode.o ext/MaskedVByte/src/varintdecode.o -QMX=ext/qmx/compress_qmx.o -# Lzturbo not included -#LZT=../lz/lz8c0.o ../lz/lz8d.o ../lz/lzbc0.o ../lz/lzbd.o +#------------------------------------------------------------------- +ifeq ($(NCODEC1), 0) +OB+=ext/streamvbyte/src/streamvbyte.o ext/streamvbyte/src/streamvbytedelta.o +OB+=ext/MaskedVByte/src/varintencode.o ext/MaskedVByte/src/varintdecode.o +OB+=ext/simdcomp/src/simdintegratedbitpacking.o ext/simdcomp/src/simdcomputil.o ext/simdcomp/src/simdbitpacking.o ext/simdcomp/src/simdpackedselect.o +OB+=ext/simdcomp_/simdfor.o -# blosc. Set the env. 
variable "EXT=blosc" to include -#EXT=blosc -ifeq ($(EXT), blosc) -B=ext/ -CFLAGS+=-DSHUFFLE_SSE2_ENABLED -DHAVE_LZ4 -DHAVE_ZLIB -Iext/ -LFLAGS+=-lpthread -BLOSC=$(B)lz4hc.o $(B)c-blosc/blosc/blosc.o $(B)c-blosc/blosc/blosclz.o $(B)c-blosc/blosc/shuffle.o $(B)c-blosc/blosc/shuffle-generic.o $(B)c-blosc/blosc/shuffle-sse2.o +ifeq ($(AVX2),1) +OB+=ext/simdcomp/src/avxbitpacking.o endif -LZ4=ext/lz4.o +OB+=ext/LittleIntPacker/src/bitpacking32.o ext/LittleIntPacker/src/turbobitpacking32.o ext/LittleIntPacker/src/scpacking32.o ext/LittleIntPacker/src/horizontalpacking32.o +ifeq ($(AVX2),1) +OB+=ext/LittleIntPacker/src/bmipacking32.o +endif -#ZLIB=-lz +OB+=ext/libfor/for.o +#modified QMX for unaligned SIMD load/store +OB+=ext/bench_/bench/compress_qmx.o ext/bench_/bench/compress_qmx_v2.o ext/bench_/bench/compress_qmx_v3.o ext/bench_/bench/compress_qmx_v4.o +#OB+=ext/qmx.o +#OB+=ext/qmx/compress_qmx.o +OB+=ext/varintg8iu.o +OB+=ext/rc.o +endif -#BSHUFFLE=ext/bitshuffle/src/bitshuffle.o +#---------------------------------------- +ifeq ($(NCODEC2), 0) +ext/polycom/optpfd.o: ext/polycom/optpfd.c + $(CC) -O2 $(MARCH) $(CFLAGS) $< -c -o $@ -OBJS=icbench.o bitutil.o vint.o bitpack.o bitunpack.o eliasfano.o vsimple.o vp4dd.o vp4dc.o varintg8iu.o bitpackv.o bitunpackv.o $(TRANSP) ext/simple8b.o transpose.o $(BLOSC) $(SIMDCOMP) $(LIBFOR) $(QMX) $(LZT) $(LZ4) $(MVB) $(ZLIB) $(BSHUFFLE) +OB+=ext/polycom/optpfd.o +OB+=ext/polycom/polyvbyte.o -icbench: $(OBJS) - $(CXX) $(OBJS) -lm -o icbench $(LFLAGS) +OB+=ext/FastPFor/src/bitpacking.o ext/FastPFor/src/simdbitpacking.o ext/FastPFor/src/simdunalignedbitpacking.o -idxseg: idxseg.o - $(CC) idxseg.o -o idxseg +ifeq ($(HAVE_ZLIB), 1) +CDEFS+=-DZLIB +ifeq ($(STATIC),1) +OB+=/usr/lib/x86_64-linux-gnu/libz.a +else +OB+=-lz +endif +else +#ZD=zlib/ +#OB+=$(ZD)adler32.o $(ZD)crc32.o $(ZD)compress.o $(ZD)deflate.o $(ZD)infback.o $(ZD)inffast.o $(ZD)inflate.o $(ZD)inftrees.o $(ZD)trees.o $(ZD)uncompr.o $(ZD)zutil.o +endif + 
+OB+=ext/lz4/lib/lz4hc.o ext/lz4/lib/lz4.o +OB+=ext/bitshuffle/src/bitshuffle.o ext/bitshuffle/src/iochain.o ext/bitshuffle/src/bitshuffle_core.o + +ifeq ($(BLOSC),1) +LDFLAGS+=-lpthread +CFLAGS+=-Iext/ -DSHUFFLE_SSE2_ENABLED +OB+=ext/c-blosc2/blosc/blosc.o ext/c-blosc2/blosc/blosclz.o ext/c-blosc2/blosc/schunk.o ext/c-blosc2/blosc/delta.o ext/c-blosc2/blosc/shuffle.o ext/c-blosc2/blosc/shuffle-generic.o ext/c-blosc2/blosc/shuffle-sse2.o \ +ext/c-blosc2/blosc/bitshuffle-generic.o ext/c-blosc2/blosc/bitshuffle-sse2.o +endif + +endif + +OB+=bitutil.o vint.o bitpack.o bitunpack.o eliasfano.o vsimple.o vp4d.o vp4c.o bitpack128v.o bitunpack128v.o bitunpack128h.o $(TRANSP) ext/simple8b.o transpose.o + +#-------------------------------------------------------------------------------------------------------------------------------------------------------------- +ICLIB=bitpack.o bitunpack.o bitunpack128v.o vint.o vp4d.o bitutil.o bitpack128v.o + +ifeq ($(AVX2),1) +OB+=bitpack256v.o bitunpack256v.o +ICLIB+=bitpack256v.o bitunpack256v.o +endif + + +icbench: $(OB) icbench.o plugins.o + $(CXX) $^ $(LDFLAGS) -o icbench + +.c.o: + $(CC) -O3 $(MARCH) $(CFLAGS) $< -c -o $@ + +.cc.o: + $(CXX) -O3 $(MARCH) $(CXXFLAGS) $< -c -o $@ + +.cpp.o: + $(CXX) -O3 $(MARCH) $(CXXFLAGS) $< -c -o $@ + +idxseg: idxseg.o $(ICLIB) + $(CC) $^ $(LDFLAGS) -o idxseg ifeq ($(UNAME), Linux) para: CFLAGS += -DTHREADMAX=32 para: idxqry endif -idxcr: idxcr.o bitpack.o vp4dc.o bitutil.o - $(CC) idxcr.o bitpack.o bitpackv.o vp4dc.o bitutil.o -o idxcr $(LFLAGS) +idxcr: idxcr.o $(ICLIB) + $(CC) $^ $(LDFLAGS) -o idxcr $(LFLAGS) -idxqry: idxqry.o bitunpack.o vp4dd.o bitunpackv.o bitutil.o - $(CC) idxqry.o bitunpack.o bitunpackv.o vp4dd.o bitutil.o $(LIBTHREAD) $(LIBRT) -o idxqry $(LFLAGS) +idxqry: idxqry.o $(ICLIB) + $(CC) $^ $(LDFLAGS) $(LIBTHREAD) $(LIBRT) -o idxqry $(LFLAGS) -.c.o: - $(CC) -O3 $(CFLAGS) $< -c -o $@ - -.cc.o: - $(CXX) -O3 -DNDEBUG $(MARCH) $< -c -o $@ - -.cpp.o: - $(CXX) -O3 -DNDEBUG $< -c -o $@ 
- clean: @find . -type f -name "*\.o" -delete -or -name "*\~" -delete -or -name "core" -delete diff --git a/plugins.cc b/plugins.cc new file mode 100644 index 0000000..54d5784 --- /dev/null +++ b/plugins.cc @@ -0,0 +1,1153 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +#define _LARGEFILE64_SOURCE 1 +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "plugins.h" + + #ifdef NCODEC0 +#define CODEC0 0 + #else +#define CODEC0 1 + #endif + + #ifdef NCODEC1 +#define CODEC1 0 + #else +#define CODEC1 1 + #endif + + #ifdef NCODEC2 +#define CODEC2 0 + #else +#define CODEC2 0 + #endif + + #ifdef NTRANSFORM +#define TRANSFORM 0 + #else +#define TRANSFORM 1 + #endif + +enum { +#define C_MEMCPY 1 + P_LMCPY, // must be 0 + P_MCPY, // must be 1 + P_COPY, + +#define C_TURBOPFOR CODEC0 + TB_PFOR128, + TB_PFOR256, + TB_PFORDA, // actually not working w. 
mingw + TB_FOR, + TB_FORDA, + TB_PACK128H, + TB_PACK128V, + TB_PACK256H, + TB_PACK256V, + TB_PACK, + TB_VBYTE, + TB_VSIMPLE, + TB_ELIASFANO, + TB_VS_S1, + TB_VS_S4, + TB_VS_S8, + +#define C_BITSHUFFLE CODEC1 + P_BITSHUFFLE, +#define C_C_BLOSC CODEC2 + BS_LZ, + BS_LZ4, + BS_ZLIB, + BS_LZ_1, + BS_LZ4_S1, + BS_LZ4_1, + BS_ZLIB_1, + BS_SHUFFLE, +#define C_FASTPFOR CODEC1 + FP_VBYTE, + FP_FASTPFOR, + FP_SIMDFASTPFOR, + FP_SIMDOPTPFOR, + FP_OPTPFOR, + FP_SIMPLE8BRLE, + FP_SIMDPACK, +#define C_LIBFOR CODEC1 + LF_FOR, + LF_FORX, +#define C_LITTLEPACK CODEC1 + LI_PACK, + LI_BMIPACK, + LI_TURBOPACK, + LI_HORPACK, + LI_SCPACK, +#define C_LZ4 CODEC1 + LZ4_, + LZ4_S1, + LZ4_S4, + LZ4_S8, +#define C_MASKEDVBYTE CODEC1 + P_MASKEDVBYTE, +#define C_POLYCOM CODEC1 + PC_OPTPFD, // compression too slow and limited to 28 bits. crashs on some lists + PC_VBYTE, + PC_RICE, + PC_SIMPLE16, // limited to 28 bits. +#define C_QMX CODEC1 + P_QMX,P_QMX2,P_QMX3,P_QMX4, +#define C_SIMDCOMP CODEC1 + SC_PACK, + SC_SIMDPACK128, + SC_SIMDPACK256, + SC_FOR, + SC_FORDA, +#define C_SIMPLE8B CODEC1 // //crash on integers w. size 32 bits ! 
+ AM_SIMPLE8B, +#define C_STREAMVBYTE CODEC1 + P_STREAMVBYTE, +#define C_VARINTG8IU CODEC1 + P_VARINTG8IU, + #ifdef ZLIB +#define C_ZLIB CODEC2 + #else +#define C_ZLIB 0 + #endif + P_ZLIB, +#define C_TRANSFORM CODEC0 + TB_TP8_32, + TB_TP8V_32, + TB_TP4V_32, + TB_ZIGZAG_32, + TB_DELTA_32, +#define C_LZTURBO CODEC2 + P_LZT, + P_VSHUF, + + P_MAX +}; + + #ifndef __SSSE3__ +#define C_VINTG8IU 0 +#define C_MASKEDVBYTE 0 + #endif + #if C_SIMPLE8B +#include "ext/simple8b.h" // optimized simple-8b by powturbo + #endif + + +//------------------------------------------------------------------------------- + #if C_C_BLOSC +#include "ext/c-blosc2/blosc/shuffle.h" +#include "ext/c-blosc2/blosc/blosc.h" + #endif + + #if C_FASTPFOR +#include "ext/FastPFor/headers/variablebyte.h" +#include "ext/FastPFor/headers/simple16.h" +#include "ext/FastPFor/headers/simple8b_rle.h" + +#include "ext/FastPFor/headers/fastpfor.h" +#include "ext/FastPFor/headers/simdfastpfor.h" +#include "ext/FastPFor/headers/optpfor.h" +#include "ext/FastPFor/headers/simdoptpfor.h" +//#include "ext/FastPFor/headers/compositecodec.h" + #endif + + #if C_LIBFOR +#include "ext/libfor/for.h" +unsigned char *for_selectx( unsigned char *__restrict in, unsigned n, unsigned *__restrict out) { unsigned b = in[4], i; for(i = 0; i < n; i++) out[i] = for_select(in, i); return in + 5 + for_compressed_size_bits(n, b); } + #endif + + #if C_LZ4 +#include "ext/lz4/lib/lz4.h" +#include "ext/lz4/lib/lz4hc.h" + #endif + +// #if C_SIMDCOMPLIB +//#include "ext/SIMDCompressionAndIntersection/include/bitpacking.h" +// #endif + + #if C_POLYCOM +#include "ext/vas16c.h" // Simple 16 +#include "ext/vas16d.h" +#include "ext/rc.h" +#include "ext/polycom/optpfd.h" +#include "ext/polycom/polyvbyte.h" + #endif + + #if C_QMX +//#include "ext/qmx/compress_qmx.h" +//#include "ext/qmx.h" +#include "ext/bench_/bench/compress_qmx.h" +#include "ext/bench_/bench/compress_qmx_v2.h" +#include "ext/bench_/bench/compress_qmx_v3.h" +#include 
"ext/bench_/bench/compress_qmx_v4.h" + #endif + + #if C_TURBOPFOR +#include "vint.h" +#include "vsimple.h" +#include "bitpack.h" +#include "bitunpack.h" +#include "vp4c.h" +#include "vp4d.h" +#include "eliasfano.h" +#include "bitutil.h" +#include "transpose.h" + #endif + + #if C_ZLIB +#include + #endif + +#ifdef __cplusplus +extern "C" { +#endif + + #if C_STREAMVBYTE +#include "ext/streamvbyte/include/streamvbyte.h" +#include "ext/streamvbyte/include/streamvbytedelta.h" +#undef VARINTDECODE_H_ + + #endif + + #if C_MASKEDVBYTE +#include "ext/MaskedVByte/include/varintencode.h" + #undef VARINTDECODE_H_ +#include "ext/MaskedVByte/include/varintdecode.h" + #endif + + #if C_VARINTG8IU +#include "ext/varintg8iu.h" // SIMD Varint G8IU + #endif + #if C_LITTLEPACK +#include "ext/LittleIntPacker/include/bitpacking.h" +#include "ext/LittleIntPacker/include/util.h" + #endif + +#include "ext/simdcomp/include/simdbitpacking.h" + +#ifdef __cplusplus +} +#endif + + #if C_SIMDCOMP +#undef SIMDBITPACKING_H_ +#include "ext/vabyte.h" // Standard Variable Byte +#include "ext/simdcomp/include/simdcomp.h" +unsigned char *simdpackwithoutmaskd1n(uint32_t *in, uint32_t n, uint32_t *out, uint32_t start, uint32_t b) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456; + uint32_t *ip; + for(ip = in; ip != in+(n&~(128-1)); ip += 128,out += 4 * b) + simdpackwithoutmaskd1(start, ip, (__m128i *)out, b); + return (unsigned char *)out; +} + +unsigned char *simdunpackd1n(uint32_t *in, uint32_t n, uint32_t *out, uint32_t start, uint32_t b) { + uint32_t k, *op; + for(op = out; op != out+(n&~(128-1)); op += 128,in += 4 * b) + simdunpackd1(start, (__m128i *)in, out, b); + return (unsigned char *)in; +} + +/*unsigned char *simdfor_selectx( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { + unsigned i; + for(i=0; i < n;i++) out[i] = simdselectFOR(start, (const __m128i *)in, b, i); return in + 
simdpackFOR_compressedbytes(n, b); + }*/ + + #ifdef __AVX2__ +unsigned char *avxpackwithoutmaskn(uint32_t *in, uint32_t n, uint32_t *out, uint32_t b) { + uint32_t *ip; + for(ip = in; ip != in+(n&~(256-1)); ip += 256,out += 8 * b) + avxpackwithoutmask(ip, (__m256i *)out, b); + return (unsigned char *)simdpack_shortlength(ip, n & (256-1), (__m128i *)out, b); +} +unsigned char *avxunpackn(uint32_t *in, uint32_t n, uint32_t *out, uint32_t b) { + uint32_t k, *op; + for(op = out; op != out+(n&~(256-1)); op += 256,in += 8 * b) + avxunpack((__m256i *)in, op, b); + return (unsigned char *)simdunpack_shortlength((__m128i *)in, n & (256-1), op, b); +} + #endif + #endif + + #if C_LZTURBO +#include "../../lz/lz8.h" +int lz8c0( struct lzobj *lz); +int lz8c01(struct lzobj *lz); +int lz8d( struct lzobj *lz); + +#include "../../lz/lzb.h" +int lzbc0( struct lzobj *lz); +int lzbc01(struct lzobj *lz); +int lzbc2( struct lzobj *lz); +int lzbd( struct lzobj *lz); + +//#include "../../lz/lzh.h" +int lzhc2( struct lzobj *lz); +int lzhd( struct lzobj *lz); +int prunelen = 0x200, ans_seg = 12*1024; +#include "../../lz/anst.h" +#include "../../lz/ans.h" +#include "../../lz/mh.h" +#define MH_BLK (16*1024) + #endif + + #if C_BITSHUFFLE +#define __STDC_VERSION__ 199901L +#include "ext/bitshuffle/src/bitshuffle.h" + #endif + +#define BLK_SIZE (64*1024*4) +#define BLK_V128 (128*4) +#define BLK_V256 (256*4) + +unsigned char sbuf[BLK_SIZE*2+64]; + +//------------------------------------------------- registry ------------------------------------------------------------------------------------------------- +struct plugs plugs[] = { + { TB_PFOR128, "TurboPFor", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V128 }, + { TB_PFOR256, "TurboPFor256", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V256 }, + { TB_PFORDA, "TurboPForDA", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V128 }, + { 
TB_FOR, "TurboFor", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V128 }, + { TB_FORDA, "TurboForDA", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V128 }, + { TB_PACK128V, "TurboPackV", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V128 }, +// { TB_PACK128H, "TurboPackH", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V128 }, + { TB_PACK256V, "TurboPack256V", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V256 }, + { TB_PACK256H, "TurboPack256H", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V256 }, + { TB_PACK, "TurboPack", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_V128 }, + { TB_VBYTE, "TurboVByte", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "" }, + { TB_VSIMPLE, "VSimple", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "" }, + { TB_ELIASFANO, "EliasFano", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "" }, + + { TB_VS_S1, "vs_bitshuffle", C_LZ4, "", "TurboPFor", "", "https://github.com/Cyan4973/lz4", "", 0,BLK_SIZE }, + { TB_VS_S4, "vs_s4", C_LZ4, "", "TurboPFor", "", "https://github.com/Cyan4973/lz4", "", 0,BLK_SIZE }, + { TB_VS_S8, "vs_s8", C_LZ4, "", "TurboPFor", "", "https://github.com/Cyan4973/lz4", "", 0,BLK_SIZE }, + + { TB_TP8_32, "TP8s_32", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_SIZE }, + { TB_TP8V_32, "TP8_32", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_SIZE }, + { TB_TP4V_32, "TP4_32", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_SIZE }, + { TB_ZIGZAG_32, "ZigZag_32", C_TURBOPFOR, "", "TurboPFor", "", "https://github.com/powturbo/TurboPFor", "", 0,BLK_SIZE }, + { 
TB_DELTA_32,    "Delta_32",         C_TURBOPFOR,    "",     "TurboPFor",            "",                 "https://github.com/powturbo/TurboPFor", "", 0,BLK_SIZE },
+
+  { AM_SIMPLE8B,    "Simple8b",         C_SIMPLE8B,     "",     "Simple-8b optimized",  "",                 "", "", 0,1024 },
+
+  { PC_SIMPLE16,    "PC.Simple16",      C_POLYCOM,      "",     "Polycom",              "",                 "https://github.com/encode84/bcm", "", 0,BLK_V128 },
+  { PC_OPTPFD,      "PC.OptPFD",        C_POLYCOM,      "",     "Polycom",              "",                 "", "", 0,BLK_V128},
+  { PC_VBYTE,       "PC.Vbyte",         C_POLYCOM,      "",     "Polycom",              "",                 "https://github.com/jibsen/brieflz", "" },
+  { PC_RICE,        "PC.Rice",          C_POLYCOM,      "",     "Polycom optimized",    "",                 "https://github.com/jibsen/brieflz", "", 0,BLK_V128},
+
+  { P_VARINTG8IU,   "VarintG8IU",       C_VARINTG8IU,   "",     "VarintG8IU",           "Apache license",   "", ""},
+  { P_MASKEDVBYTE,  "MaskedVbyte",      C_MASKEDVBYTE,  "",     "MaskedVbyte",          "Apache license",   "http://maskedvbyte.org", ""},
+  { P_STREAMVBYTE,  "StreamVbyte",      C_STREAMVBYTE,  "",     "StreamVbyte",          "Apache license",   "", ""},
+
+  { FP_FASTPFOR,    "FP.FastPFor",      C_FASTPFOR,     "",     "FastPFor",             "",                 "", "", 0,BLK_SIZE},
+  { FP_SIMDFASTPFOR,"FP.SimdFastPFor",  C_FASTPFOR,     "",     "FastPFor",             "",                 "", "", 0,BLK_SIZE},
+  { FP_OPTPFOR,     "FP.OptPFor",       C_FASTPFOR,     "",     "FastPFor",             "",                 "", "", 0},
+  { FP_SIMDOPTPFOR, "FP.SIMDOptPFor",   C_FASTPFOR,     "",     "FastPFor",             "",                 "", "", 0},
+  { FP_VBYTE,       "FP.VByte",         C_FASTPFOR,     "",     "FastPFor",             "",                 "", "", 0,BLK_SIZE},
+  { FP_SIMPLE8BRLE, "FP.Simple8bRLE",   C_FASTPFOR,     "",     "FastPFor",             "",                 "", "", 0,BLK_SIZE},
+
+  { SC_SIMDPACK128, "SC.SIMDPack128",   C_SIMDCOMP,     "",     "Simdcomp",             "",                 "", "", 0,BLK_V128},
+  { SC_SIMDPACK256, "SC.SIMDPack256",   C_SIMDCOMP,     "",     "Simdcomp",             "",                 "", "", 0,BLK_V256},
+  { SC_FOR,         "SC.For",           C_SIMDCOMP,     "",     "Simdcomp",             "",                 "", "", 0,BLK_V128},
+  { SC_FORDA,       "SC.ForDA",         C_SIMDCOMP,     "",     "Simdcomp",             "",                 "", "", 0,BLK_V128},
+
+//  { CL_FASTPFORD1,  "CL.SIMDPFORD1",    C_SIMDCOMP,     "",     "Simdcomp",             "",                 "", "", 0,BLK_V128},
+
+  // The LibFor rows are enabled by C_LIBFOR, the same switch that guards the
+  // LF_FOR/LF_FORX cases in the encode/decode functions (was C_SIMDCOMP).
+  { LF_FOR,         "LibFor.For",       C_LIBFOR,       "",     "LibFor",               "",                 "", "", 0,BLK_V128},
+  { LF_FORX,        "LibFor.ForDA",     C_LIBFOR,       "",     "LibFor",
"", "", "", 0,BLK_V128}, + + { LI_PACK, "LI.Pack", C_LITTLEPACK, "", "LittlePack", "", "", "", 0,BLK_V128}, + { LI_TURBOPACK, "LI.TurboPack", C_LITTLEPACK, "", "LittlePack", "", "", "", 0,BLK_V128}, + { LI_SCPACK, "LI.SuperPack", C_LITTLEPACK, "", "LittlePack", "", "", "", 0,BLK_V128}, + { LI_HORPACK, "LI.HorPack", C_LITTLEPACK, "", "LittlePack", "", "", "", 0,BLK_V128}, + { LI_BMIPACK, "LI.BMIPack256", C_LITTLEPACK, "", "LittlePack", "", "", "", 0,BLK_V128}, + + { P_QMX, "qmx", C_QMX, "", "QMX", "https://bitbucket.org/andrewtrotman/bench.git", "", "", 0,BLK_V128}, + { P_QMX2, "qmx2", C_QMX, "", "QMX", "https://bitbucket.org/andrewtrotman/bench.git", "", "", 0,BLK_V128}, + { P_QMX3, "qmx3", C_QMX, "", "QMX", "https://bitbucket.org/andrewtrotman/bench.git", "", "", 0,BLK_V128}, + { P_QMX4, "qmx4", C_QMX, "", "QMX", "https://bitbucket.org/andrewtrotman/bench.git", "", "", 0,BLK_V128}, + + { P_LZT, "LzTurbo", C_LZTURBO, "", "LzTurbo", "", "https://sites.google.com/site/powturbo", "20,21,22,32", 0,BLK_SIZE }, + { P_VSHUF, "VSimpleANS", C_LZTURBO, "", "LzTurbo", "", "https://sites.google.com/site/powturbo", "20,21,22,32", 0,BLK_SIZE }, + { LZ4_, "lz4", C_LZ4, "", "Lz4", "BSD license", "https://github.com/Cyan4973/lz4", "" }, + { LZ4_S1, "lz4_bitshufle", C_LZ4, "", "Lz4", "BSD license", "https://github.com/Cyan4973/lz4", "", 0,BLK_SIZE }, + { LZ4_S4, "lz4_tp4", C_LZ4, "", "Lz4", "BSD license", "https://github.com/Cyan4973/lz4", "", 0,BLK_SIZE }, + { LZ4_S8, "lz4_tp8", C_LZ4, "", "Lz4", "BSD license", "https://github.com/Cyan4973/lz4", "", 0,BLK_SIZE }, + { P_ZLIB, "zlib", C_ZLIB, "1.2.8","zlib", "zlib license", "http://zlib.net\thttps://github.com/madler/zlib", "", 0,BLK_SIZE }, +// { P_ZSTD, "zstd", C_ZSTD, "1.0.0","ZSTD", "BSD license+Patents","https://github.com/facebook/zstd", "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22" }, + //----------------------------------------------------------------------------------- + { P_MCPY, "memcpy", C_MEMCPY, ".", 
"memcpy", "------------", "--------------------------------------", "" }, + { P_COPY, "copy", C_MEMCPY, ".", "copy", "", "", "" }, + //----- Transform -------------- + + { P_BITSHUFFLE, "BitShuffle", C_TURBOPFOR, "", "BitShuffle", "", "", "", 0,BLK_SIZE }, + { BS_SHUFFLE, "Blosc_Shuffle", C_C_BLOSC, "", "Blosc", "", "https://github.com/Blosc/c-blosc2", "", 0,BLK_SIZE }, + +//{ P_MYCODEC, "mycodec", C_MYCODEC, "0", "My codec", " ", "", "" }, + #ifdef LZTURBO +// #include "../beplugr.h" + #endif + { -1 } +}; +//----------------------------------- +#define VBPUT32(a,b) vbxput32(a,b) +#define VBGET32(a,b) vbxget32(a,b) + +#define _TRANSFORM // Transform functions : transpose,zigzag +#define _TP_BITS 1 // transpose bits (4,8,16,32,64) + +//--------------------------------------- TurboPFor ---------------------------- + #if C_BITSHUFFLE +#define BITSHUFFLE(in,n,out) bshuf_bitshuffle(in, out, (n)/4, 4, 0); memcpy((char *)out+((n)&(~31)),(char *)in+((n)&(~31)),(n)&31) +#define BITUNSHUFFLE(in,n,out) bshuf_bitunshuffle(in, out, (n)/4, 4, 0);memcpy((char *)out+((n)&(~31)),(char *)in+((n)&(~31)),(n)&31) + #endif + + #if _TP_BITS == 16 +#define TRANSPOSE(in,n,out) transpose2(in,n,out) +#define UNTRANSPOSE(in,n,out) untranspose2(in,n,out) + #elif _TP_BITS == 64 +#define TRANSPOSE(in,n,out) transpose8(in,n,out) +#define UNTRANSPOSE(in,n,out) untranspose8(in,n,out) + #elif _TP_BITS == 128 +#define TRANSPOSE(in,n,out) transpose16(in,n,out) +#define UNTRANSPOSE(in,n,out) untranspose16(in,n,out) + #else +#define TRANSPOSE(in,n,out) transpose4(in,n,out) +#define UNTRANSPOSE(in,n,out) untranspose4(in,n,out) + #endif + +#define PAD8(__x) (((__x)+7)/8) + +unsigned char *u32enc(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *ip; + #if 0 + memcpy(out,in,n*4); return (unsigned char *)(out+n); + #else + for(ip = in; ip != in+(n&~3); ) { + *out++ = *ip++; + *out++ = *ip++; + *out++ = *ip++; + *out++ = *ip++; + } + while(ip < in+n) *out++ = *ip++; + return (unsigned char 
*)out; + #endif +} + +unsigned char *u32dec(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *op; + #if 0 + memcpy(out,in,n*4); return (unsigned char *)(in+n); + #else + for(op = out; op != out+(n&~3); ) { + *op++ = *in++; + *op++ = *in++; + *op++ = *in++; + *op++ = *in++; + } + while(op < out+n) *op++ = *in++; + return (unsigned char *)in; + #endif +} + +unsigned char *_bitunpackx32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out , unsigned b) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, k, b); return in + PAD8(n*b); } + +// direct access functions included for demonstration only. Use the bulk functions instead, if you are decompressing most of the values +unsigned char *bitf1unpackx32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, int start, unsigned b) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, i, b)+start+i+1; return in + PAD8(n*b); } +unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, int start, unsigned b) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, i, b)+start; return in + PAD8(n*b); } +//---------------------------------------------- plugins -------------------------------------------------------- +#include "conf.h" + #ifndef max +#define max(x,y) (((x)>(y)) ? 
(x) : (y)) + #endif + +void libmemcpy(unsigned char *dst, unsigned char *src, int len) { + void *(*memcpy_ptr)(void *, const void *, size_t) = memcpy; + if (time(NULL) == 1) + memcpy_ptr = NULL; + memcpy_ptr(dst, src, len); +} + +int codini(size_t insize, int codec) { + switch(codec) { + #if C_C_BLOSC2 + case BS_LZ4: blosc_init(); blosc_set_nthreads(1); blosc_set_compressor(BLOSC_LZ4_COMPNAME);break; + case BS_ZLIB: blosc_init(); blosc_set_nthreads(1); blosc_set_compressor(BLOSC_ZLIB_COMPNAME); break; + case BS_LZ: blosc_init(); blosc_set_nthreads(1); blosc_set_compressor(BLOSC_BLOSCLZ_COMPNAME); break; + #endif + #if C_VARINTG8IU + case P_VARINTG8IU: VarIntG8IU(); + #endif + #ifdef LZTURBO + // #include "../beplug0.h" + #endif + } +} + +void codexit(int codec) {} + +//-------------- Sorted integer array : Delta/Differential compression (mode=0 increasing, mode=1 strictly increasing sequence) --------------- +#define VBPUT32(a,b) { unsigned _xx= b; vbxput32(a,_xx);} +#define VBGET32(a,b) vbxget32(a,b) + +unsigned char *codcomps(unsigned char *_in, unsigned _n, unsigned char *out, int outsize, int codec, int lev, char *prm, int inc) { + unsigned *in = (unsigned *)_in, n = (_n+3) / 4,i; //for(i = 1; i < n; i++) if(in[i] < in[i-1]+inc) die("IDs not sorted %d:%d,%d\n", i, in[i-1], in[i] ); + + unsigned pa[BLK_SIZE+2048],x,b; + + switch(codec&0x3f) { + //----------- copy --------------------------------------------------------------------------------------------------------- + case P_COPY: return u32enc( in, n, (unsigned *)out); + case P_MCPY: memcpy(out, _in, _n); return out+_n; + + case TB_VBYTE: VBPUT32(out, in[0]); + return inc?vbd1enc32(in+1, n-1, out, in[0]):vbdenc32(in+1, n-1, out, in[0]); + case TB_VSIMPLE: bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); return vsenc32( pa, n, out); + case TB_ELIASFANO: x = *in++; VBPUT32(out, x); --n; + if(inc) { return n == 128?efano1enc128v32(in, n, out, x+1):efano1enc32(in, n, out, x+1); } + else { return n == 
128?efanoenc128v32( in, n, out, x ):efanoenc32( in, n, out, x ); } + + case TB_PFOR128: bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); return n == 128?p4enc128v32(pa, n, out):p4enc32(pa, n, out); + #ifdef __AVX2__ + case TB_PFOR256: bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); return n == 256?p4enc256v32(pa, n, out):p4enc32(pa, n, out); + #endif + case TB_PFORDA: DELTR( in, n, inc, pa); VBPUT32(out, in[0]); return p4encx32( pa+1, n-1, out); + case TB_FOR : + case TB_FORDA: if(inc) { b = bitf132(in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return bitf1pack32(in+1, n, out, in[0], b); } + else { b = bitf32( in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return bitfpack32( in+1, n, out, in[0], b); } + + case TB_PACK128H: + case TB_PACK: if(inc) { b = bitd132(in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return bitd1pack32(in+1, n, out, in[0], b); } + else { b = bitd32( in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return bitdpack32( in+1, n, out, in[0], b); } + case TB_PACK128V: if(inc) { b = bitd132(in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return n < 128?bitd1pack32(in+1, n, out, in[0], b):bitd1pack128v32(in+1, out, in[0], b); } + else { b = bitd32( in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return n < 128?bitdpack32( in+1, n, out, in[0], b):bitdpack128v32( in+1, out, in[0], b); } + + #ifdef __AVX2__ + case TB_PACK256V: b = bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); *out++=b; return n == 256?bitpack256v32(pa, out,b):bitpack32(pa, n, out,b); + //case TB_PACK256V: if(inc) { b = bitd132(in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return n < 256?bitd1pack32(in+1, n, out, in[0], b):bitd1pack256v32(in+1, out, in[0], b); } + // else { b = bitd32( in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; return n < 256?bitdpack32( in+1, n, out, in[0], b):bitdpack256v32( in+1, out, in[0], b); } + #endif + case AM_SIMPLE8B: b = bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); if(b>28) 
die("simple-8b overflow.bits size>28\n"); + return vs8benc( pa, n, out); + #if C_FASTPFOR + case FP_VBYTE: bitdelta32( in, n, pa, -inc, inc); return vbyteenc( pa, n, (unsigned *)out); + #endif + + #if C_MASKEDVBYTE + case P_MASKEDVBYTE: return out+vbyte_encode_delta(in, n, out, 0); + #endif + + #ifdef C_LIBFOR + case LF_FORX: //b = bitf32( in+1, --n, in[0]); *(unsigned *)out = in[0]; out+= 4; *out++=b; return out + for_compress_bits( in+1, out, n, in[0], b); //if(b < 0) b = maxbits(in), *out++ = b; return out + for_compress_bits(in, out, n, 0, b); + case LF_FOR: return out + for_compress_sorted(in, out, n); + #endif + + #if C_POLYCOM + case PC_VBYTE: bitdelta32( in, n, pa, -inc, inc); return vbpolyenc(pa, n, out); + case PC_SIMPLE16: b = bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); if(b>28) die("simple16 overflow.bits size>28\n"); + return vs16enc( pa, n, (unsigned *)out); + case PC_OPTPFD: b = bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); if(b>28) die("optp4 overflow.bits size>28\n"); + return optpfdenc32(pa, n, out); + /*b = bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); if(b>28) die("optp4 overflow.bits size>28\n"); + if(n < 128) return vbyteenc( pa, n, (unsigned *)out); + else { return out + OPT4(pa, n, (unsigned *)out); }*/ + #endif + + #if C_QMX + //case P_QMX: { bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); unsigned char *q = qmx_enc(pa, n, out+4); *(unsigned *)out = q - (out+4); return q; } + #endif + + #if C_QMX + case P_QMX: { bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); ANT_compress_qmx qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)pa, (uint64_t)n); ctou32(out)=r; return out+4+r; } + case P_QMX2: { bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); ANT_compress_qmx_v2 qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)pa, (uint64_t)n); ctou32(out)=r; return out+4+r; } + case P_QMX3: { bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); 
ANT_compress_qmx_v3 qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)pa, (uint64_t)n); ctou32(out)=r; return out+4+r; } + case P_QMX4: { bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); ANT_compress_qmx_v4 qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)pa, (uint64_t)n); ctou32(out)=r; return out+4+r; } + #endif + + + #if C_SIMDCOMP + case SC_FOR: + case SC_FORDA: b = bitf32( in+1, --n, in[0]); VBPUT32(out, in[0]); *out++=b; if(n<128) return (unsigned char *)simdpackFOR_length(in[0], (in+1), n, (__m128i *)out, b); + case SC_SIMDPACK128: + if(n < 129) { bitdelta32( in, n, pa, -inc, inc); return vbyteenc((unsigned *)pa, n, (unsigned *)out); } + else { b = simdmaxbitsd1(in[0], in+1); VBPUT32(out, in[0]); *out++=b; return simdpackwithoutmaskd1n(in+1, n-1, (unsigned *)out, in[0], b); } + #endif else { simdpackFOR(in[0], (in+1), (__m128i *)out, b); return out + simdpackFOR_compressedbytes(n, b); } + + #if C_STREAMVBYTE + case P_STREAMVBYTE: return out + streamvbyte_delta_encode(in, n, out, inc); + #endif + + #if C_VARINTG8IU + case P_VARINTG8IU: bitdelta32( in, n, pa, -inc, inc); return vintg8enc(pa, n, out); + #endif + // --------- transpose + lz77 ------------------------------------------------------------------------------------------------ + #if C_TRANSFORM + case TB_ZIGZAG_32: b = bitzigzag32(in, n, (unsigned *)out, 0); return out + n*4; + case TB_TP8_32: bitdelta32(in, n, (unsigned *)sbuf, -inc, inc); _transpose4( (unsigned char *)sbuf, n*4, out); return out + n*4; + case TB_TP8V_32: bitdelta32(in, n, (unsigned *)sbuf, -inc, inc); TRANSPOSE( (unsigned char *)sbuf, n*4, out); return out + n*4; + case TB_TP4V_32: bitdelta32(in, n, (unsigned *)sbuf, -inc, inc); transposen4( (unsigned char *)sbuf, n*4, out); return out + n*4; + case TB_DELTA_32: bitdelta32(in, n, (unsigned *)out, -inc, inc); return out + n*4; + #endif + + // --------- delta + transpose + lz77 
---------------------------------------------------------------------------------------- + #if C_LZTURBO + case P_LZT10:{ bitdelta32(in, n, (unsigned *)out, -inc, inc); TRANSPOSE((unsigned char *)out, n*4, sbuf); struct lzobj lz; lz.srclen = n*4; lz.src = sbuf; lz.dst = out; lz.dstlen = n*4; lz.level = 0; lz.hbits = 16; return out + lz8c01(&lz); } + case P_LZT20:{ bitdelta32(in, n, (unsigned *)out, -inc, inc); TRANSPOSE((unsigned char *)out, n*4, sbuf); struct lzobj lz; lz.srclen = n*4; lz.src = sbuf; lz.dst = out; lz.dstlen = n*4; lz.level = 0; lz.hbits = 16; return out + lzbc01(&lz); } + case P_LZT22:{ bitdelta32(in, n, (unsigned *)out, -inc, inc); TRANSPOSE((unsigned char *)out, n*4, sbuf); struct lzobj lz; lz.srclen = n*4; lz.src = sbuf; lz.dst = out; lz.dstlen = n*4; lz.level = 2; lz.hbits = 26; return out + lzbc2(&lz); } + case P_VSHUF:{ bitdelta32(in, n, (unsigned *)out, -inc, inc); TRANSPOSE((unsigned char *)out, n*4, sbuf); unsigned char *p = mheenc(sbuf, n*4, MH_BLK, out+4, out+n*4+MH_BLK); ctou32(out) = p-(out+4); return p; } + //case P_VSHUF:{ bitdelta32( in+1, --n, pa, in[0], inc); VBPUT32(out, in[0]); unsigned char *p=vsenc32(pa, n, sbuf); p = mheenc(sbuf, p-sbuf, MH_BLK, out+4, out+n*4+1024); ctou32(out) = p-(out+4); return p; } + #endif + + #if C_C_BLOSC + case BS_LZ: + case BS_LZ4: + case BS_ZLIB: return out + blosc_compress(1/*clevel*/, BLOSC_SHUFFLE, 4/*typesize*/, n*4, in, out, n*4+BLOSC_MAX_OVERHEAD); + case BS_LZ_1: + case BS_LZ4_1: + case BS_ZLIB_1: return out + blosc_compress(1/*clevel*/, BLOSC_DELTA, 4/*typesize*/, n*4, in, out, n*4+BLOSC_MAX_OVERHEAD); + #endif + + #if C_LZ4 + #if C_BITSHUFFLE + case LZ4_S1: { bitdelta32(in, n, (unsigned *)out, -inc, inc); BITSHUFFLE((unsigned char *)out, n*4, sbuf); return out + LZ4_compress((char *)sbuf, (char *)out, n*4); } // bshuf_bitshuffle(out, sbuf, n*4/32, 32, 0); + #endif + case LZ4_S4: { bitdelta32(in, n, (unsigned *)out, -inc, inc); transposen4((unsigned char *)out, n*4, sbuf); return out + 
LZ4_compress((char *)sbuf, (char *)out, n*4); } // bshuf_bitshuffle(out, sbuf, n*4/32, 32, 0); + case LZ4_S8: { bitdelta32(in, n, (unsigned *)out, -inc, inc); TRANSPOSE((unsigned char *)out, n*4, sbuf); return out + LZ4_compress((char *)sbuf, (char *)out, n*4); } // bshuf_bitshuffle(out, sbuf, n*4/32, 32, 0); + #endif + + #if C_ZLIB + case P_ZLIB1: case P_ZLIB2: case P_ZLIB3: case P_ZLIB4: case P_ZLIB5: case P_ZLIB6: case P_ZLIB7: case P_ZLIB8: case P_ZLIB9: + { bitdelta32(in, n, (unsigned *)out, -inc, inc); TRANSPOSE((unsigned char *)out, n*4, sbuf); uLongf outlen = n*4; int rc = compress2(out+4, &outlen, sbuf, n*4, codec-P_ZLIB1+1); if(rc != Z_OK) die("zlib compress2 rc=%d\n", rc); *(unsigned *)out = outlen; return out + 4 + outlen; } + #endif + case P_MAX ... 63: break; + } + return out; +} + +unsigned char *coddecomps(unsigned char *in, unsigned _n, unsigned char *_out, int outlen, int codec, int lev, char *prm, int inc) { + unsigned *out = (unsigned *)_out, n = (outlen+3) / 4,x,b; + switch(codec&0x3f) { + //------------- copy ------------------------------------------------------- + case P_COPY: return u32dec( (unsigned *)in, n, out); + case P_MCPY: memcpy(_out, in, _n); return _out+_n; + + case TB_ELIASFANO:VBGET32(in, x); *out++ = x; --n; + if(inc) { return n==128?efano1dec128v32(in, n, out, x+1 ):efano1dec32( in, n, out, x+1); } + else { return n==128?efanodec128v32( in, n, out, x ):efanodec32( in, n, out, x); } + case TB_PFOR128: VBGET32(in, x); *out++ = x; --n; //__builtin_prefetch(in+256); + if(inc) { return n==128?p4d1dec128v32( in, n, out, x ):p4d1dec32(in, n, out, x); } + else { return n==128?p4ddec128v32( in, n, out, x ):p4ddec32( in, n, out, x); } + #ifdef __AVX2__ + case TB_PFOR256: VBGET32(in, x); *out++ = x; --n; //__builtin_prefetch(in+256); + if(inc) { return n==256?p4d1dec256v32( in, n, out, x ):p4d1dec32(in, n, out, x); } + else { return n==256?p4ddec256v32( in, n, out, x ):p4ddec32( in, n, out, x); } + #endif + case TB_PFORDA: VBGET32(in, 
x);*out = x; return inc?p4fdecx32( in, n-1, out+1, x ):p4f0decx32(in, n-1, out+1, x); + case TB_VBYTE: VBGET32(in, x); *out = x; return inc?vbd1dec32( in, n-1, out+1, x ):vbddec32(in, n-1, out+1, x); + case TB_VSIMPLE: VBGET32(in, x); *out = x; in = vsdec32( in, n-1, out+1); bitundx32(out, n, -inc, inc); break; + case TB_FOR: VBGET32(in, x);*out = x; b = *in++; return inc?bitf1unpack32( in, n-1, out+1, x, b):bitfunpack32( in, n-1, out+1, x, b); + case TB_FORDA: VBGET32(in, x);*out = x; b = *in++; return inc?bitf1unpackx32( in, n-1, out+1, x, b):bitfunpackx32( in, n-1, out+1, x, b); + case TB_PACK: VBGET32(in, x);*out = x; b = *in++; return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); + case TB_PACK128V: VBGET32(in, x);*out = x; b = *in++; + if(n <= 128) { return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); } + else { return inc?bitd1unpack128v32 (in, out+1, x, b):bitdunpack128v32( in, out+1, x, b); } + #ifdef __AVX2__ + #if 0 + case TB_PACK256V: VBGET32(in, x);*out = x; b = *in++; + if(n <= 256) return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); + else { in = bitunpack256v32( in, out+1, b);bitundx32(out, n, -inc, inc); } break; + #else + case TB_PACK256V: VBGET32(in, x);*out = x; b = *in++; + if(n <= 256) { return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); } + else { return inc?bitd1unpack256v32(in, out+1, x, b):bitdunpack256v32( in, out+1, x, b); } + #endif + #endif + /*case TB_PACK128H: VBGET32(in, x);*out = x; b = *in++; + if(n <= 128) { return inc?bitd1unpack32( in, n-1, out+1, x, b):bitdunpack32( in, n-1, out+1, x, b); } + else { return inc?bitd1unpack128h32 (in, out+1, x, b):bitdunpack128h32( in, out+1, x, b); }*/ + + case AM_SIMPLE8B: VBGET32(in, x); *out = x; in = vs8bdec( in, n-1, out+1); bitundx32(out, n, -inc, inc); break; + + #if C_FASTPFOR + case FP_VBYTE: in = vbytedec( in, n, out); bitundx32(out, n, -inc, inc); break; 
+ #endif + + #if C_LIBFOR + case LF_FORX: return for_selectx(in, n, out); //{ out[0] = *(unsigned *)in, b = in[4]; return for_selectx(in+5, n-1, out+1, out[0], b); } + case LF_FOR: return in + for_uncompress(in, out, n); + #endif + + #if C_MASKEDVBYTE + case P_MASKEDVBYTE: in += masked_vbyte_decode_delta(in, out, n, 0); break; + #endif + + #if C_POLYCOM + case PC_VBYTE: in = vbpolydec( in, n, out); bitundx32(out, n, -inc, inc); break; + case PC_SIMPLE16: VBGET32(in, x); *out = x; in = vs16dec((unsigned *)in, n-1, out+1); bitundx32(out, n, -inc, inc); break; + case PC_OPTPFD: VBGET32(in, x); *out = x; in = optpfddec32(in,n-1,out+1); bitundx32(out+1, n-1, x, inc); break; + /*if(n < 129) in = vbytedec(in, n, out); + else { _VBGET32(in, x, *out = x); unsigned all_array[2048]; in = (unsigned char *)detailed_p4_decode(out+1, (unsigned *)in, all_array); } + bitundx32(out, n, -inc, inc); break;*/ + #endif + //case P_QMX: { VBGET32(in, x); *out = x; unsigned l = *(unsigned *)in; in = qmx_dec(in+4, l, out+1, n-1); bitundx32(out+1, n-1, x, inc); break; } + #if C_QMX //case P_QMX: return qmx_dec(in+4, ctou32(in), out, n); + case P_QMX: { VBGET32(in, x); *out = x; unsigned l = *(unsigned *)in; ANT_compress_qmx qmx; qmx.decompress(out+1, n-1, in+4, ctou32(in)); bitundx32(out+1, n-1, x, inc); return in+4+ctou32(in);} + case P_QMX2: { VBGET32(in, x); *out = x; unsigned l = *(unsigned *)in; ANT_compress_qmx_v2 qmx; qmx.decompress(out+1, n-1, in+4, ctou32(in)); bitundx32(out+1, n-1, x, inc); return in+4+ctou32(in);} + case P_QMX3: { VBGET32(in, x); *out = x; unsigned l = *(unsigned *)in; ANT_compress_qmx_v3 qmx; qmx.decompress(out+1, n-1, in+4, ctou32(in)); bitundx32(out+1, n-1, x, inc); return in+4+ctou32(in);} + case P_QMX4: { VBGET32(in, x); *out = x; unsigned l = *(unsigned *)in; ANT_compress_qmx_v4 qmx; qmx.decompress(out+1, n-1, in+4, ctou32(in)); bitundx32(out+1, n-1, x, inc); return in+4+ctou32(in);} + #endif + + #if C_SIMDCOMP + case SC_SIMDPACK128: + if(n < 129) { in = 
vbytedec(in, n, out); bitundx32(out, n, -inc, inc); } + else { VBGET32(in, x);*out = x; b = *in++; in = simdunpackd1n((uint32_t *)in, n-1, out+1, out[0], b); } break; + + case SC_FOR: VBGET32(in, x);*out = x; b = *in++; if(n < 129) return (unsigned char *)simdunpackFOR_length(x, (const __m128i *)in, n-1, (unsigned *)(out+1), b); else { simdunpackFOR(x, (const __m128i *)in, (unsigned *)(out+1), b); return in + simdpackFOR_compressedbytes(n-1, b); } + case SC_FORDA: { VBGET32(in, x);*out++ = x; b = *in++; unsigned i; for(i=0; i < n-1;i++) out[i] = simdselectFOR(x, (const __m128i *)in, b, i); return in + simdpackFOR_compressedbytes(n-1, b); } + #endif + + #if C_STREAMVBYTE + case P_STREAMVBYTE: return in + streamvbyte_delta_decode(in, out, n, inc); + #endif + + #if C_VARINTG8IU + case P_VARINTG8IU: in = vintg8dec( in, n, out); bitundx32(out, n, -inc, inc); break; + #endif + //---------- transpose + lz77 ---------------------- + #if C_TRANSFORM + case TB_ZIGZAG_32:memcpy(out, in, n*4); bitunzigzag32(out, n, 0); return in + n*4; + case TB_TP8_32: _untranspose4( (unsigned char *)in, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); return in + n*4; + case TB_TP8V_32: UNTRANSPOSE((unsigned char *)in, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); return in + n*4; + case TB_TP4V_32: untransposen4((unsigned char *)in, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); return in + n*4; + case TB_DELTA_32: memcpy(out, in, n*4); bitundx32(out, n, -inc, inc); return in + n*4; + #endif + //---------- delta + transpose + lz77 ---------------------- + #ifdef C_LZ4 + #if C_BITSHUFFLE + case LZ4_S1: in += LZ4_decompress_fast((char *)in, (char *)sbuf, n*4); BITUNSHUFFLE(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); break; + #endif + case LZ4_S4: in += LZ4_decompress_fast((char *)in, (char *)sbuf, n*4); untransposen4(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); break; + case LZ4_S8: in += LZ4_decompress_fast((char *)in, 
(char *)sbuf, n*4); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); break; + #endif + + #if C_ZLIB + case P_ZLIB1: case P_ZLIB2: case P_ZLIB3: case P_ZLIB4: case P_ZLIB5: case P_ZLIB6: case P_ZLIB7: case P_ZLIB8: case P_ZLIB9: + { uLongf outsize = n*4; int l = *(unsigned *)in, rc = uncompress(sbuf, &outsize, in+4, l); in += 4 + l; UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); } break; + #endif + + #if C_C_BLOSC + case BS_LZ: + case BS_LZ4: + case BS_ZLIB: + case BS_LZ_1: + case BS_LZ4_1: + case BS_ZLIB_1: { blosc_decompress(in, out, n*4); size_t nbytes, cbytes,blocksize; blosc_cbuffer_sizes(in, &nbytes, &cbytes, &blocksize); return in+cbytes; } + #endif + + #if C_LZTURBO + case P_LZT10: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lz8d(&lz); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); } break; + case P_LZT20: + case P_LZT22: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lzbd(&lz); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); } break; + //case P_VSHUF: VBGET32(in, x);*out = x; in += ransdecompress( in, (n-1)*4, sbuf); vsdec32( sbuf, n-1, out+1); bitundx32(out, n, -inc, inc); break; + + case P_VSHUF: { unsigned inlen = ctou32(in); in+=4; in = mhdec(in, inlen, MH_BLK, sbuf, n*4); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); bitundx32(out, n, -inc, inc); } break; + //case P_VSHUF: { VBGET32(in, x);*out = x; unsigned inlen = ctou32(in); in+=4; in = mhdec(in, inlen, MH_BLK, sbuf, n*4); vsdec32(sbuf, n-1, out+1); bitundx32(out+1, n-1, x, inc); } break; + #endif + case P_MAX ... 
63: break; + } + return in; +} + +unsigned char *codcomp(unsigned char *_in, unsigned _n, unsigned char *out, int outsize, int codec, int lev, char *prm, int b) { + unsigned *in = (unsigned *)_in, n = (_n+3) / 4; + int i,xb; //int outlen; unsigned char *oend=out+outsize; + switch(codec&0x3f) { + //---------- copy ---------------------------------------------------- + case P_COPY: return u32enc( in, n, (unsigned *)out); + case P_MCPY: return u32enc( in, n, (unsigned *)out); //memcpy(out, _in, _n); return out+_n; + // --------- variable byte ------------------------------------------- + case TB_VBYTE: return vbenc32( in, n, out); + case TB_VSIMPLE: return vsenc32( in, n, out); + case AM_SIMPLE8B: return vs8benc( in, n, out); + case TB_ELIASFANO:return out; + case TB_PFORDA: return p4encx32( in, n, out); + case TB_PFOR128: return n == 128?p4enc128v32(in, n, out):p4enc32(in, n, out); + #ifdef __AVX2__ + case TB_PFOR256: return n == 256?p4enc256v32(in, n, out):p4enc32(in, n, out); + #endif + case TB_FOR : + case TB_FORDA: + case TB_PACK128H: + case TB_PACK: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return bitpack32(in, n, out, b); + case TB_PACK128V: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return n != 128?bitpack32(in, n, out, b):bitpack128v32(in, out, b); + #ifdef __AVX2__ + case TB_PACK256V: if(b < 0) { BITSIZE32(in, n, b); *out++ = b; } return n != 256?bitpack32(in, n, out, b):bitpack256v32(in, out, b); + #endif + // case P_VU: return vuenc32( in, n, out); + + #if C_FASTPFOR + case FP_FASTPFOR: { + size_t nvalue = outsize/4; + FastPForLib::FastPFor<4> ic; ic.encodeArray((const int32_t *)in, n & (~127), (uint32_t *)(out+4), nvalue); + if(n & 127) { + size_t nvalue2 = outsize/4 - nvalue; + FastPForLib::VariableByte vc; vc.encodeArray((const int32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2); + nvalue += nvalue2; + } + ctou32(out) = nvalue; + return out+4+nvalue*4; + } + case FP_SIMDFASTPFOR: { + size_t nvalue = outsize/4; + 
FastPForLib::SIMDFastPFor<4> ic; ic.encodeArray((const int32_t *)in, n & (~127), (uint32_t *)(out+4), nvalue); + if(n & 127) { + size_t nvalue2 = outsize/4 - nvalue; + FastPForLib::VariableByte vc; vc.encodeArray((const int32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2); + nvalue += nvalue2; + } + ctou32(out) = nvalue; + return out+4+nvalue*4; + } + case FP_SIMDOPTPFOR: { + size_t nvalue = outsize/4; + FastPForLib::SIMDOPTPFor<4> ic; ic.encodeArray((const int32_t *)in, n & (~127), (uint32_t *)(out+4), nvalue); + if(n & 127) { + size_t nvalue2 = outsize/4 - nvalue; + FastPForLib::VariableByte vc; vc.encodeArray((const int32_t *)(in + (n & (~127))), n & 127, (uint32_t *)(out + 4 + nvalue*4), nvalue2); + nvalue += nvalue2; + } + ctou32(out) = nvalue; + return out+4+nvalue*4; + } + case FP_VBYTE: //return vbyteenc( in, i, (unsigned *)out); + { size_t nvalue=outsize/4; FastPForLib::VariableByte ic; ic.encodeArray((const int32_t *)in, (const size_t)n, (uint32_t *)(out+4), nvalue); ctou32(out)=nvalue; return out+4+nvalue*4; } + case FP_SIMPLE8BRLE: { size_t nvalue=outsize/4; FastPForLib::Simple8b_RLE ic; ic.encodeArray((const int32_t *)in, (const size_t)n, (uint32_t *)(out+4), nvalue); ctou32(out)=nvalue; return out+4+nvalue*4; } + #endif + + #if C_LIBFOR + case LF_FORX: //if(b < 0) b = maxbits(in), *out++ = b; return out + for_compress_bits(in, out, n, 0, b); + case LF_FOR: return out + for_compress_unsorted(in, out, n); + #endif + + #if C_LITTLEPACK + case LI_PACK: if(b < 0) { b = maxbits_length(in,n); *out++ = b; } pack32( in, n, b, out); return out + byte_count(n,b); + case LI_HORPACK: + case LI_TURBOPACK:if(b < 0) { b = maxbits_length(in,n); *out++ = b; } turbopack32(in, n, b, out); return out + byte_count(n,b); + case LI_SCPACK: if(b < 0) { b = maxbits_length(in,n); *out++ = b; } scpack32( in, n, b, out); return out + byte_count(n,b); + #ifdef __AVX2__ + case LI_BMIPACK: if(b < 0) { b = maxbits_length(in,n); *out++ = b; } bmipack32( 
in, n, b, out); return out + byte_count(n,b); + #endif + #endif + + #if C_MASKEDVBYTE + case P_MASKEDVBYTE: return out + vbyte_encode(in, n, out); + #endif + + #if C_POLYCOM + case PC_VBYTE: return vbpolyenc(in, n, out); + case PC_SIMPLE16: return vs16enc( in, n, (unsigned *)out); + case PC_RICE: return rcenc32( in, n, (unsigned *)out); + case PC_OPTPFD: return optpfdenc32(in, n, out); //if(n < 128) return vbyteenc(in, n, (unsigned *)out); else { unsigned tmp[2048]; for(i = 0; i < n; i++) tmp[i] = in[i]; return out + OPT4(tmp, n, (unsigned *)out); } + #endif + + #if C_QMX + case P_QMX: { ANT_compress_qmx qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)in, (uint64_t)n); ctou32(out)=r; return out+4+r; } // { unsigned char *q = qmx_enc(in, n, out+4); ctou32(out) = q - (out+4); return q; + case P_QMX2: { ANT_compress_qmx_v2 qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)in, (uint64_t)n); ctou32(out)=r; return out+4+r; } + case P_QMX3: { ANT_compress_qmx_v3 qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)in, (uint64_t)n); ctou32(out)=r; return out+4+r; } + case P_QMX4: { ANT_compress_qmx_v4 qmx; unsigned r=qmx.compress(out+4, outsize, (uint32_t *)in, (uint64_t)n); ctou32(out)=r; return out+4+r; } + #endif + + #if C_SIMDCOMP + //case SC_PACK: if(b < 0) { b = maxbits_length(in,n); *out++ = b; } return fastpackwithoutmask32(in, n, (uint32_t *)out, b); + case SC_FOR: + case SC_FORDA: + case SC_SIMDPACK128: if(b < 0) b = maxbits(in), *out++ = b; return (unsigned char *)simdpack_length(in, n, (__m128i *)out, b); + #ifdef __AVX2__ + case SC_SIMDPACK256: if(b < 0) b = avxmaxbits(in), *out++ = b; return (unsigned char *)avxpackwithoutmaskn(in, n, (unsigned *)out, b); + #endif + #endif + + #if C_STREAMVBYTE + case P_STREAMVBYTE: return out + streamvbyte_encode(in, n, out); + #endif + #if C_VARINTG8IU + case P_VARINTG8IU: return vintg8enc(in, n, out); + #endif + + // --------- transform ---------------------------------------- + #if C_TRANSFORM + case 
TB_ZIGZAG_32: bitzigzag32(in, n, (unsigned *)out, 0); return out + n*4; + case TB_TP8_32: _transpose4( (unsigned char *)in, n*4, out); return out + n*4; + case TB_TP8V_32: TRANSPOSE( (unsigned char *)in, n*4, out); return out + n*4; + case TB_TP4V_32: transposen4( (unsigned char *)in, n*4, out); return out + n*4; + #endif + #if C_C_BLOSC + case BS_SHUFFLE: shuffle( 4, n*4, (unsigned char *)in, out); return out + n*4; + #endif + + #if C_BITSHUFFLE + case P_BITSHUFFLE: bshuf_bitshuffle(in, out, n, 4, 0); return out + n*4; + #endif + + // --------- transpose + lz77 ---------------------------------------- + #if C_LZTURBO + case P_LZT10: { n *= 4; TRANSPOSE( (unsigned char *)in, n, sbuf); struct lzobj lz; lz.srclen = n; lz.src = sbuf; lz.dst = out; lz.dstlen = n; lz.level = 0; lz.hbits = 16; return out + lz8c01(&lz); } + case P_LZT20: { n *= 4; TRANSPOSE( (unsigned char *)in, n, sbuf); struct lzobj lz; lz.srclen = n; lz.src = sbuf; lz.dst = out; lz.dstlen = n; lz.level = 0; lz.hbits = 16; return out + lzbc01(&lz); } + case P_LZT22: { n *= 4; TRANSPOSE( (unsigned char *)in, n, sbuf); struct lzobj lz; lz.srclen = n; lz.src = sbuf; lz.dst = out; lz.dstlen = n; lz.level = 2; lz.hbits = 26; return out + lzbc2(&lz); } + case P_LZT32: { n *= 4; TRANSPOSE( (unsigned char *)in, n, sbuf); struct lzobj lz; lz.srclen = n; lz.src = sbuf; lz.dst = out; lz.dstlen = n; lz.level = 2; lz.hbits = 26; return out + lzhc2(&lz); } + //case P_VSHUF: { unsigned char *p = vsenc32(in, n, sbuf); p = mheenc(sbuf, p-sbuf, MH_BLK, out+2, out+n*4+256); ctou16(out) = p-(out+2); printf("L=%d ", p-(out+2)); return p; } //out + ranscompress( sbuf, xb, out, out+n*4); + //case P_VSHUF: { unsigned char *p = mheenc(in, n*4, MH_BLK, out+4, out+n*4+256); ctou32(out) = p-(out+4); return p; } //out + ranscompress( sbuf, xb, out, out+n*4); + case P_VSHUF: { unsigned char *p;/* = vsenc32(in, n, sbuf)*/ n *= 4; TRANSPOSE( (unsigned char *)in, n, sbuf); p = mheenc(sbuf, n/*p-sbuf*/, MH_BLK, out+4, out+n+256); 
ctou32(out) = p-(out+4); return p; } //out + ranscompress( sbuf, xb, out, out+n*4); + #endif + + #if C_BITSHUFFLE + case TB_VS_S1: BITSHUFFLE( (unsigned char *)in, n*4, sbuf); return vsenc32((unsigned *)sbuf, n, out); + #endif + case TB_VS_S4: transposen4( (unsigned char *)in, n*4, sbuf); return vsenc32((unsigned *)sbuf, n, out); + case TB_VS_S8: TRANSPOSE( (unsigned char *)in, n*4, sbuf); return vsenc32((unsigned *)sbuf, n, out); + + #if C_LZ4 + #if C_BITSHUFFLE + case LZ4_S1: BITSHUFFLE( (unsigned char *)in, n*4, sbuf); return out + LZ4_compress((char *)sbuf, (char *)out, n*4); + #endif + case LZ4_S4: transposen4( (unsigned char *)in, n*4, sbuf); return out + LZ4_compress((char *)sbuf, (char *)out, n*4); + case LZ4_S8: TRANSPOSE( (unsigned char *)in, n*4, sbuf); return out + LZ4_compress((char *)sbuf, (char *)out, n*4); + #endif + + #if C_C_BLOSC + case BS_LZ: + case BS_LZ4: + case BS_ZLIB: return out + blosc_compress(1/*clevel*/, BLOSC_SHUFFLE, 4/*typesize*/, n*4, in, out, n*4+BLOSC_MAX_OVERHEAD); + case BS_LZ_1: + case BS_LZ4_S1: + case BS_ZLIB_1: return out + blosc_compress(1/*clevel*/, BLOSC_BITSHUFFLE, 4/*typesize*/, n*4, in, out, n*4+BLOSC_MAX_OVERHEAD); + #endif + + #if C_ZLIB + case P_ZLIB1: case P_ZLIB2: case P_ZLIB3: case P_ZLIB4: case P_ZLIB5: case P_ZLIB6: case P_ZLIB7: case P_ZLIB8: case P_ZLIB9: + { n *= 4; TRANSPOSE( (unsigned char *)in, n, sbuf); uLongf outlen = n; int rc = compress2(out+4, &outlen, sbuf, n, codec-P_ZLIB1+1); if(rc != Z_OK) die("zlib compress2 rc=%d\n", rc); *(unsigned *)out = outlen; return out + 4 + outlen; } + #endif + + case P_MAX ... 
63: die("library '%d' not included\n", codec); + } + return out; +} + +unsigned char *coddecomp(unsigned char *in, unsigned _n, unsigned char *_out, int outlen, int codec, int lev, char *prm, int b) { + unsigned *out = (unsigned *)_out, n = (outlen+3) / 4; + switch(codec&0x3f) { + case P_COPY: return u32dec( (unsigned *)in, n, out); + case P_MCPY: return u32dec( (unsigned *)in, n, out); //memcpy(_out, in, _n); return in+_n; + + // case P_VU: return vudec32( in, n, out); + case TB_VBYTE: return vbdec32( in, n, out); + case TB_PFOR128 : __builtin_prefetch(in+256);return n == 128?p4dec128v32(in, n, out):p4dec32(in, n, out); + #ifdef __AVX2__ + case TB_PFOR256 : __builtin_prefetch(in+256);return n == 256?p4dec256v32(in, n, out):p4dec32(in, n, out); + #endif + case TB_PFORDA : return p4decx32( in, n, out); + case AM_SIMPLE8B: return vs8bdec( in, n, out); + case TB_VSIMPLE: return vsdec32( in, n, out); + case TB_ELIASFANO: return in; + case TB_FOR: if(b < 0) b = *in++; return bitfunpack32( in, n, out, 0, b); + case TB_FORDA: if(b < 0) b = *in++; return _bitunpackx32( in, n, out, b); + case TB_PACK: if(b < 0) b = *in++; return bitunpack32( in, n, out, b); + //case TB_PACK128H: if(b < 0) b = *in++; return n != 128?bitunpack32(in, n, out, b):bitunpack128h32(in, out, b); + case TB_PACK128V: if(b < 0) b = *in++; return n != 128?bitunpack32(in, n, out, b):bitunpack128v32(in, out, b); + #ifdef __AVX2__ + case TB_PACK256V: if(b < 0) b = *in++; return n != 256?bitunpack32(in, n, out, b):bitunpack256v32(in, out, b); + #endif + + #if C_FASTPFOR + case FP_FASTPFOR: { + size_t nvalue = n; + FastPForLib::FastPFor<4> ic; const uint32_t *ip = ic.decodeArray((const int32_t *)(in+4), ctou32(in), out, nvalue); + if(n & 127) { + nvalue = n - nvalue; + FastPForLib::VariableByte vc; + return (unsigned char *)vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue); //return vbdec32((unsigned char *)ip, n & 127, out + mynvalue1); + } + return (unsigned char *)ip; + 
} + + case FP_SIMDFASTPFOR: { + size_t nvalue = n; + FastPForLib::SIMDFastPFor<4> ic; const uint32_t *ip = ic.decodeArray((const int32_t *)(in+4), ctou32(in), out, nvalue); + if(n & 127) { + nvalue = n - nvalue; + FastPForLib::VariableByte vc; + return (unsigned char *)vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue); //return vbdec32((unsigned char *)ip, n & 127, out + mynvalue1); + } + return (unsigned char *)ip; + } + case FP_SIMDOPTPFOR: { + size_t nvalue = n; + FastPForLib::SIMDOPTPFor<4> ic; const uint32_t *ip = ic.decodeArray((const int32_t *)(in+4), ctou32(in), out, nvalue); + if(n & 127) { + nvalue = n - nvalue; + FastPForLib::VariableByte vc; + return (unsigned char *)vc.decodeArray(ip, (const uint32_t *)in+1+ctou32(in) - ip, out + (n&(~127)), nvalue); //return vbdec32((unsigned char *)ip, n & 127, out + mynvalue1); + } + return (unsigned char *)ip; + } + + case FP_VBYTE: //return vbytedec( in, n, out); + { size_t nvalue=n; FastPForLib::VariableByte ic; return (unsigned char *)ic.decodeArray((const int32_t *)(in+4), ctou32(in), (uint32_t *)out, nvalue); } + case FP_SIMPLE8BRLE: + { size_t nvalue=n; FastPForLib::Simple8b_RLE ic; ic.decodeArray((const int32_t *)(in+4), ctou32(in), (uint32_t *)out, nvalue); return in+4+ctou32(in)*4; } + #endif + + #if C_LIBFOR + case LF_FORX: return for_selectx(in, n, out); //if(b < 0) b = *in++; return for_selectx(in, n, out, 0, b);// + case LF_FOR: return in + for_uncompress(in, out, n); //if(b < 0) b = *in++; return in + for_uncompress_bits(in, out, n, 0, b); //return in + for_uncompress(in, out, n); + #endif + + #if C_LITTLEPACK + case LI_PACK: if(b < 0) b = *in++; unpack32( in, n, b, out); return in + byte_count(n,b); + case LI_TURBOPACK: if(b < 0) b = *in++; turbounpack32( in, n, b, out); return in + byte_count(n,b); + case LI_SCPACK: if(b < 0) b = *in++; scunpack32( in, n, b, out); return in + byte_count(n,b); + case LI_HORPACK: if(b < 0) b = *in++; horizontalunpack32(in, n, b, 
out); return in + byte_count(n,b); + #ifdef __AVX2__ + case LI_BMIPACK: if(b < 0) b = *in++; bmiunpack32( in, n, b, out); return in + byte_count(n,b); + #endif + #endif + + #if C_MASKEDVBYTE + case P_MASKEDVBYTE: return in + masked_vbyte_decode(in, out, n); + #endif + + #if C_POLYCOM + case PC_VBYTE: return vbpolydec(in, n, out); + case PC_SIMPLE16: return vs16dec( (unsigned *)in, n, out); + case PC_RICE: return rcdec32( (unsigned *)in, n, out); + case PC_OPTPFD : return optpfddec32( in, n, out); //if(n < 128) return vbytedec(in, n, out); else { unsigned all_array[2048]; return (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); } + #endif + + #if C_SIMDCOMP + //case SC_PACK: if(b < 0) b = *in++; return fastunpack32((uint32_t *)in, n, out, b); + case SC_FOR: + case SC_SIMDPACK128: if(b < 0) b = *in++; return (unsigned char *)simdunpack_length( (__m128i *)in, n, out, b); + //case SC_FORDA: if(b < 0) b = *in++; return simdfor_selectx(in, n, out, 0, b); + #ifdef __AVX2__ + case SC_SIMDPACK256: if(b < 0) b = *in++; return (unsigned char *)avxunpackn( (unsigned *)in, n, out, b); + #endif + #endif + + #if C_STREAMVBYTE + case P_STREAMVBYTE: return in + streamvbyte_decode(in, out, n); + #endif + + #if C_QMX //case P_QMX: return qmx_dec(in+4, ctou32(in), out, n); + case P_QMX: { ANT_compress_qmx qmx; qmx.decompress(out, n, in+4, ctou32(in)); return in+4+ctou32(in);} // { unsigned char *q = qmx_enc(in, n, out+4); ctou32(out) = q - (out+4); return q; + case P_QMX2: { ANT_compress_qmx_v2 qmx; qmx.decompress(out, n, in+4, ctou32(in)); return in+4+ctou32(in);} + case P_QMX3: { ANT_compress_qmx_v3 qmx; qmx.decompress(out, n, in+4, ctou32(in)); return in+4+ctou32(in);} + case P_QMX4: { ANT_compress_qmx_v4 qmx; qmx.decompress(out, n, in+4, ctou32(in)); return in+4+ctou32(in);} + #endif + + #if C_VARINTG8IU + case P_VARINTG8IU: return vintg8dec(in, n, out); + #endif + //---------- transpose + lz77 ---------------------- + #if C_TRANSFORM + case TB_ZIGZAG_32: 
memcpy(out, in, n*4); bitunzigzag32(out, n, 0); return in + n*4; + case TB_TP8_32: _untranspose4( (unsigned char *)in, n*4, (unsigned char *)out); return in + n*4; + case TB_TP8V_32: UNTRANSPOSE( (unsigned char *)in, n*4, (unsigned char *)out); return in + n*4; + case TB_TP4V_32: untransposen4( (unsigned char *)in, n*4, (unsigned char *)out); return in + n*4; + #endif + #if C_C_BLOSC + case BS_SHUFFLE: unshuffle( 4, n*4, (unsigned char *)in, (unsigned char *)out); return in + n*4; + #endif + + #if C_BITSHUFFLE + case P_BITSHUFFLE: bshuf_bitunshuffle(in, out, n, 4, 0); return in + n*4; + #endif + + //---------- transpose + lz77 ---------------------- + #if C_LZTURBO + case P_LZT10: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lz8d(&lz); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); } break; + case P_LZT20: + case P_LZT22: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lzbd(&lz); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); } break; + case P_LZT32: { struct lzobj lz; lz.dstlen = n*4; lz.src = in; lz.dst = sbuf; lz.level = 0; in += lzhd(&lz); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); } break; + //case P_VSHUF: { unsigned inlen = ctou16(in); printf("l=%d ", inlen); in+=2; in = mhdec(in, inlen, MH_BLK, sbuf, n*4); vsdec32(sbuf, n, out); } break;// in = ransdecompress( in, n*4, sbuf); + //case P_VSHUF: { unsigned inlen = ctou32(in); in+=4; in = mhdec(in, inlen, MH_BLK, out, n*4); } break;// in = ransdecompress( in, n*4, sbuf); + case P_VSHUF: { unsigned inlen = ctou32(in); in+=4; in = mhdec(in, inlen, MH_BLK, sbuf, n*4); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); /*vsdec32(sbuf, n, out);*/ } break;// in = ransdecompress( in, n*4, sbuf); + #endif + + #if C_BITSHUFFLE + case TB_VS_S1: in = vsdec32(in, n, (unsigned *)sbuf); BITUNSHUFFLE(sbuf, n*4, (unsigned char *)out); break;/*bshuf_bitunshuffle(sbuf, out, n*4/32, 32, 0); */ + #endif + case TB_VS_S4: in = vsdec32(in, n, (unsigned *)sbuf); 
untransposen4(sbuf, n*4, (unsigned char *)out); break; + case TB_VS_S8: in = vsdec32(in, n, (unsigned *)sbuf); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); break;/*bshuf_bitunshuffle(sbuf, out, n*4/32, 32, 0); */ + + #if C_LZ4 + #if C_BITSHUFFLE + case LZ4_S1: in += LZ4_decompress_fast((char *)in, (char *)sbuf, n*4); BITUNSHUFFLE(sbuf, n*4, (unsigned char *)out); break;/*bshuf_bitunshuffle(sbuf, out, n*4/32, 32, 0); */ + #endif + case LZ4_S4: in += LZ4_decompress_fast((char *)in, (char *)sbuf, n*4); untransposen4(sbuf, n*4, (unsigned char *)out); break; + case LZ4_S8: in += LZ4_decompress_fast((char *)in, (char *)sbuf, n*4); UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); break;/*bshuf_bitunshuffle(sbuf, out, n*4/32, 32, 0); */ + #endif + + #if C_BLOSC + case BS_LZ: + case BS_LZ4: + case BS_ZLIB: + case BS_LZ_1: + case BS_LZ4_1: + case BS_ZLIB_1: { blosc_decompress(in, out, n*4); size_t nbytes, cbytes,blocksize; blosc_cbuffer_sizes(in, &nbytes, &cbytes, &blocksize); return in+cbytes; } + #endif + + #if C_ZLIB + case P_ZLIB1: case P_ZLIB2: case P_ZLIB3: case P_ZLIB4: case P_ZLIB5: case P_ZLIB6: case P_ZLIB7: case P_ZLIB8: case P_ZLIB9: + { uLongf outsize = n*4; int l = *(unsigned *)in, rc = uncompress(sbuf, &outsize, in+4, l); in += 4 + l; UNTRANSPOSE(sbuf, n*4, (unsigned char *)out); } break; + #endif + case P_MAX ... 
63: die("library '%d' not included\n", codec); + } + return in; +} + +char *codver(int codec, char *v, char *s) { + switch(codec) { + #if C_C_BLOSC2 + return BLOSC_VERSION_STRING; + #endif + #if C_LZ4 + case LZ4_: sprintf(s,"%d.%d.%d", LZ4_VERSION_MAJOR, LZ4_VERSION_MINOR, LZ4_VERSION_RELEASE); return s; + #endif + + #if C_ZSTD + case P_ZSTD: sprintf(s,"%d.%d.%d", ZSTD_VERSION_MAJOR, ZSTD_VERSION_MINOR, ZSTD_VERSION_RELEASE); return s; + #endif + + default: strcpy(s,v); + } + return s; +} diff --git a/plugins.h b/plugins.h new file mode 100644 index 0000000..e0b4c9a --- /dev/null +++ b/plugins.h @@ -0,0 +1,54 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// TurboPFor: plugins.h - settings + +struct plugs { + int id; + char *s; + int codec; + char *ver,*name,*lic,*url,*lev; + unsigned flag,blksize; +}; + + #ifdef __cplusplus +extern "C" { + #endif +extern struct plugs plugs[]; +int codini(size_t insize, int codec); +void codexit(int codec); +int codstart( unsigned char *in, int inlen, int codec); +unsigned char *codcomp( unsigned char *in, unsigned n, unsigned char *out, int outsize, int codec, int lev, char *prm, int b); +unsigned char *codcomps( unsigned char *in, unsigned n, unsigned char *out, int outsize, int codec, int lev, char *prm, int inc); +unsigned char *coddecomp( unsigned char *in, unsigned n, unsigned char *out, int outlen, int codec, int lev, char *prm, int b); +unsigned char *coddecomps(unsigned char *in, unsigned n, unsigned char *out, int outlen, int codec, int lev, char *prm, int inc); +char *codver(int codec, char *v, char *s); +void *_valloc(size_t size, int a); +void _vfree(void *p, size_t size); + +typedef unsigned char *(*CODCOMP)( unsigned char *_in, unsigned _n, unsigned char *out, int outsize, int codec, int lev, char *prm, int b); +typedef unsigned char *(*CODDECOMP)(unsigned char *in, unsigned _n, unsigned char *_out, int outlen, int codec, int lev, char *prm, int b); + + #ifdef __cplusplus +} + #endif diff --git a/transpose.c b/transpose.c index d3f0e48..bec4631 100644 --- a/transpose.c +++ b/transpose.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// transpose.c - nibble/byte transpose +// Nibble/Byte transpose #if !defined(TRANSPOSE) && !defined(TRANSPOSEV) 
#include diff --git a/transpose.h b/transpose.h index 449ee8f..b77c81d 100644 --- a/transpose.h +++ b/transpose.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify diff --git a/vint.c b/vint.c index bafc6d6..5a956e1 100644 --- a/vint.c +++ b/vint.c @@ -21,20 +21,27 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// vint.c - "Integer Compression" variable byte -#include - +// vint.c - "Integer Compression" variable byte #ifndef USIZE +#include +#include + #include "conf.h" #include "vint.h" #include "bitutil.h" -#define UN 8 +#define UN 8 // 4 // + +#define VDELTA 0 +#define VBDENC vbdenc +#define VBDDEC vbddec +#define VBDGETX vbdgetx +#define VBDGETGEQ vbdgetgeq #define USIZE 32 #include __FILE__ #undef USIZE - + #define USIZE 64 #include __FILE__ #undef USIZE @@ -43,197 +50,245 @@ #include __FILE__ #undef USIZE -#define USIZE 15 +#define VDELTA 1 +#define VBDENC vbd1enc +#define VBDDEC vbd1dec +#define VBDGETX vbd1getx +#define VBDGETGEQ vbd1getgeq + +#define USIZE 32 +#include __FILE__ +#undef USIZE + +#define USIZE 64 +#include __FILE__ +#undef USIZE + +#define USIZE 16 #include __FILE__ #undef USIZE -#else - #if USIZE == 15 -#define uint_t uint16_t -#define zigzagenc15 zigzagenc16 -#define zigzagdec15 zigzagdec16 #else #define uint_t TEMPLATE3(uint, USIZE, _t) - #endif - + + #if VDELTA == 0 +#define OVERFLOWD(in,n,out,vbmax) if(*in == vbmax) { memcpy(out, in+1, n*(USIZE/8)); return in+1+n*(USIZE/8); } +#define OVERFLOWE(in,n,out,op,vbmax) if(op > out + n*(USIZE/8)) { *out = vbmax; memcpy(out+1, in, n*(USIZE/8)); op = out+1+n*(USIZE/8); } + +//#define RLE(_ip_,_op_) if(_ip_+1 < e && *_ip_ == *(_ip_+1)) { uint_t *_q = _ip_+1; while(_q+1 < e && *(_q+1) == *_ip_) _q++; unsigned _r = _q - _ip_;\ +// { _ip_+=_r; _r--; *_op_++= _ip_[0]?252:253; vbput32(op, _r); if(_ip_[0]) TEMPLATE2(vbput, USIZE)(op, 
_ip_[0]); }} +//#define RLE(_ip_,_op_) + unsigned char *TEMPLATE2(vbdec, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out) { register uint_t x, *op; - for(op = out; op != out+(n&~(UN-1)); op += UN) { - TEMPLATE2(_vbget, USIZE)(in, x, op[0] = x); - TEMPLATE2(_vbget, USIZE)(in, x, op[1] = x); - TEMPLATE2(_vbget, USIZE)(in, x, op[2] = x); - TEMPLATE2(_vbget, USIZE)(in, x, op[3] = x); __builtin_prefetch(in+8*USIZE, 0); + OVERFLOWD(in, n, out, VB_MAX); + #define VBE(_i_) TEMPLATE2(_vbget, USIZE)(in, x, op[_i_] = x) + for(op = out; op != out+(n&~(UN-1)); op += UN) { VBE(0); VBE(1); VBE(2); VBE(3); __builtin_prefetch(in+8*USIZE, 0); #if UN > 4 - TEMPLATE2(_vbget, USIZE)(in, x, op[4] = x); - TEMPLATE2(_vbget, USIZE)(in, x, op[5] = x); - TEMPLATE2(_vbget, USIZE)(in, x, op[6] = x); - TEMPLATE2(_vbget, USIZE)(in, x, op[7] = x); - #endif + VBE(4); VBE(5); VBE(6); VBE(7); + #endif } - while(op != out+n) - TEMPLATE2(_vbget, USIZE)(in, x, *op++ = x ); + while(op != out+n) TEMPLATE2(_vbget, USIZE)(in, x, *op++ = x ); return in; } +#undef VBE unsigned char *TEMPLATE2(vbenc, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out) { - register uint_t x, *ip; - for(ip = in; ip != in+(n&~(UN-1)); ip += UN) { __builtin_prefetch(ip+USIZE*8, 0); - x = ip[0]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - x = ip[1]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - x = ip[2]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - x = ip[3]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - #if UN > 4 - x = ip[4]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - x = ip[5]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - x = ip[6]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - x = ip[7]; TEMPLATE2(_vbput, USIZE)(out, x, ;); - #endif - } - while(ip != in+n) { - x = *ip++; TEMPLATE2(_vbput, USIZE)(out, x, ;); - } - return out; -} - -unsigned char *TEMPLATE2(vbdenc, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) { - uint_t *ip,v; - for(ip = in; ip != in+(n&~(4-1)); ) { - v = 
(*ip)-start; start = *ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - v = (*ip)-start; start = *ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - v = (*ip)-start; start = *ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - v = (*ip)-start; start = *ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - } - while(ip != in+n) { - v = (*ip)-start; start = *ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - } - return out; -} - -unsigned char *TEMPLATE2(vbddec, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) { - uint_t x,*op; - for(op = out; op != out+(n&~(UN-1)); ) { - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - #if UN > 4 - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += x); - #endif - } - while(op != out+n) _vbget32(in, x, *op++ = (start += x)); - return in; -} - -#define VINT_Z 32 -unsigned char *TEMPLATE2(vbd1enc, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) { - uint_t *ip, v; + register uint_t x, *ip, *e=in+n; unsigned char *op = out; - - #if VINT_Z == USIZE - #define VINTZ(x) x - uint_t b = 0; - v = in[0] - start - 1; - unsigned long long u = (unsigned long long)v<<1; - if(n == 1) u |= 1; - TEMPLATE2(_vbput, USIZE)(op, u, ;); - if(!--n) return op; - start = *in++; - #else - #define VINTZ(x) - #endif - - for(ip = in; ip != in + (n&~(4-1)); ) { - v = (*ip)-start-1; start = *ip++; TEMPLATE2(_vbput, USIZE)(op, v, ;); VINTZ(b |= v); - v = (*ip)-start-1; start = *ip++; TEMPLATE2(_vbput, USIZE)(op, v, ;); VINTZ(b |= v); - v = 
(*ip)-start-1; start = *ip++; TEMPLATE2(_vbput, USIZE)(op, v, ;); VINTZ(b |= v); - v = (*ip)-start-1; start = *ip++; TEMPLATE2(_vbput, USIZE)(op, v, ;); VINTZ(b |= v); + #define VBD(_i_) x = ip[_i_]; TEMPLATE2(_vbput, USIZE)(op, x, ;); + for(ip = in; ip != in+(n&~(UN-1)); ip += UN) { __builtin_prefetch(ip+USIZE*8, 0); + VBD(0); VBD(1); VBD(2); VBD(3); + #if UN > 4 + VBD(4); VBD(5); VBD(6); VBD(7); + #endif } while(ip != in+n) { - v = (*ip)-start-1; start = *ip++; TEMPLATE2(_vbput, USIZE)(op, v, ;); VINTZ(b |= v); - } - - #if VINT_Z == USIZE - if(!b) { - u = (unsigned long long)in[-1] << 1 | 1; - TEMPLATE2(_vbput, USIZE)(out, u, ;); - return out; + x = *ip++; + TEMPLATE2(_vbput, USIZE)(op, x, ;); } - #endif + OVERFLOWE(in,n,out,op,VB_MAX); return op; } -#undef VINTZ +#undef VBD -unsigned char *TEMPLATE2(vbd1dec, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) { - uint_t x,*op; - #if VINT_Z == USIZE - unsigned long long u; TEMPLATE2(_vbget, USIZE)(in, u, ;); x = u>>1; *out = (start += x+1); - if(u & 1) { - #ifdef __SSE2__ - out++; --n; BITDIZERO32(out, n, start, 1); - #else - for(x = 1; x < n; x++) out[x] = start+x; - #endif - return in; - } - out++; --n; - #endif - - for(op = out; op != out+(n&~(8-1)); ) { - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - #if UN > 4 - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - #endif - } - while(op != out+n) { - TEMPLATE2(_vbget, USIZE)(in, x, ++x); *op++ = (start += x); - } - return in; +uint_t TEMPLATE2(vbgetx, USIZE)(unsigned char *__restrict in, unsigned idx) { + unsigned char *ip; + unsigned 
i; + uint_t x; + if(*in == 255) return TEMPLATE2(ctou, USIZE)(in+1+idx*(USIZE/8)); + for(ip = in,i = 0; i <= idx; i++) + ip += TEMPLATE2(_vbvlen, USIZE)(*ip); + TEMPLATE2(_vbget, USIZE)(in, x, ;); + return x; } -#undef VINT_Z +/*unsigned TEMPLATE2(vbgeteq, USIZE)(unsigned char *__restrict in, unsigned n, uint_t key, unsigned char **__restrict _ip) { + unsigned i; + unsigned char *ip; + uint_t x; + if(*in == 255) { + for(ip = (*_ip==in)?in:*ip; ip < in+n; ip+USIZE/8) { + TEMPLATE2(_vbget, USIZE)(ip, x, ;); + if((x = TEMPLATE2(ctou, USIZE)(ip)) == key) break; + } + } else for(ip = *_ip,i=idx; i < n; i++) { + TEMPLATE2(_vbget, USIZE)(ip, x, ;); + if(x == key) break; + } + *_ip = ip; + return i; +}*/ unsigned char *TEMPLATE2(vbzenc, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) { uint_t *ip,v; - for(ip = in; ip != in+(n&~(4-1)); ) { - v = TEMPLATE2(zigzagenc, USIZE)((*ip)-start); start=*ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - v = TEMPLATE2(zigzagenc, USIZE)((*ip)-start); start=*ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - v = TEMPLATE2(zigzagenc, USIZE)((*ip)-start); start=*ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - v = TEMPLATE2(zigzagenc, USIZE)((*ip)-start); start=*ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - } - while(ip < in+n) { - v = TEMPLATE2(zigzagenc, USIZE)((*ip)-start); start=*ip++; TEMPLATE2(_vbput, USIZE)(out, v, ;); - } - return out; + unsigned char *op = out; + #define VBZE { v = TEMPLATE2(zigzagenc, USIZE)((*ip)-start); start=*ip++; TEMPLATE2(_vbput, USIZE)(op, v, ;); } + for(ip = in; ip != in+(n&~(4-1)); ) { VBZE;VBZE;VBZE;VBZE; } + while(ip < in+n) VBZE; + //OVERFLOWE(in,n,out,op); + return op; } +#undef VBZE unsigned char *TEMPLATE2(vbzdec, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) { uint_t x,*op; - for(op = out; op != out+(n&~(UN-1)); ) { - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); - TEMPLATE2(_vbget, USIZE)(in, 
x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); + #define VBZD { TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); } + for(op = out; op != out+(n&~(UN-1)); ) { VBZD; VBZD; VBZD; VBZD; #if UN > 4 - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); + VBZD; VBZD; VBZD; VBZD; #endif } - while(op != out+n) { - TEMPLATE2(_vbget, USIZE)(in, x, ;); *op++ = (start += TEMPLATE2(zigzagdec, USIZE)(x)); - } + while(op != out+n) VBZD; return in; } +#undef VBZD + +uint_t TEMPLATE2(vbzgetx, USIZE)(unsigned char *__restrict in, unsigned idx, uint_t start) { + unsigned char *ip; + unsigned i; + uint_t x; + for(ip = in,i = 0; i <= idx; i++) { + TEMPLATE2(_vbget, USIZE)(ip, x, ;); + start += x+1; + } + return start; +} + +unsigned TEMPLATE2(vbzgeteq, USIZE)(unsigned char **__restrict in, unsigned n, unsigned idx, uint_t key, uint_t start ) { + unsigned i; + unsigned char *ip; + uint_t x; + for(ip = *in,i=idx; i < n; i++) { + TEMPLATE2(_vbget, USIZE)(ip, x, ;); + if((start += x+1) == key) + break; + } + *in = ip; + return i; +} + #endif + +unsigned char *TEMPLATE2(VBDENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, uint_t start) { + unsigned char *op = out; if(!n) return out; + uint_t *ip, b=0,v /*,x=in[0]-start-VDELTA*/; + #define VBDE { v = (*ip)-start-VDELTA; start = *ip++; TEMPLATE2(_vbput, USIZE)(op, v, ;); b |= (v /*^ x*/); } + for(ip = in; ip != in + (n&~(UN-1)); ) { VBDE;VBDE;VBDE;VBDE; + #if UN > 4 + VBDE; VBDE; VBDE; VBDE; + 
#endif + } + while(ip != in+n) VBDE; + if(!b) { op = out; *op++ = VB_MAX; } // if (x) { op = out; *op++ = VB_MAX-2; TEMPLATE2(_vbput, USIZE)(op, x, ;); } + #if USIZE < 64 + OVERFLOWE(in,n,out,op,VB_MAX-1); + #endif + return op; +} +#undef VBDE + +unsigned char *TEMPLATE2(VBDDEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start) { + uint_t x,*op; + if(!n) return in; + #if USIZE < 64 + OVERFLOWD(in,n,out,VB_MAX-1); + #endif + + if(in[0] == VB_MAX) { + in++; + #if defined(__SSE2__) && USIZE == 32 + #if VDELTA == 0 + if(n) BITZERO32(out, n, start); + #else + if(n) BITDIZERO32(out, n, start, VDELTA); + #endif + #else + #if VDELTA == 0 + for(x = 0; x < n; x++) out[x] = start; + #else + for(x = 0; x < n; x++) out[x] = start+x+VDELTA; + #endif + #endif + return in; + } + #if 0 //USIZE < 64 + else if(in[0] == VB_MAX-2) { in++; + uint_t z; + TEMPLATE2(_vbget, USIZE)(in, z, ;); + #if VDELTA == 0 + for(x = 0; x < n; x++) out[x] = start+z; + #else + for(x = 0; x < n; x++) out[x] = start+x+z; + #endif + return in; + } + #endif + #define VBDD { TEMPLATE2(_vbget, USIZE)(in, x, x+=VDELTA); *op++ = (start += x); } + for(op = out; op != out+(n&~(UN-1)); ) { + VBDD; VBDD; VBDD; VBDD; + #if UN > 4 + VBDD; VBDD; VBDD; VBDD; + #endif + } + while(op != out+n) VBDD; + return in; +} +#undef VBDD + +uint_t TEMPLATE2(VBDGETX, USIZE)(unsigned char *__restrict in, unsigned idx, uint_t start) { + unsigned char *ip; + unsigned i; + uint_t x; + + #if USIZE > 64 + unsigned long long u; + _vbget64(in, u, ;); x = u>>1; start += x+1; + if(u & 1) return start + ; + #endif + for(ip = in; i <= idx; i++) { + TEMPLATE2(_vbget, USIZE)(ip, x, ;); + start += x+1; + } + return start; +} + +unsigned TEMPLATE2(VBDGETGEQ, USIZE)(unsigned char **__restrict in, unsigned n, unsigned idx, uint_t *key, uint_t start ) { + unsigned i=0; + unsigned char *ip; + uint_t x; + #if USIZE < 64 + if(!idx) { + unsigned long long u; _vbget64(in, u, ;); x = u>>1; start += x+1; + if((u & 1) && 
start == *key) { *in = ip; return 0; } + i++; + } + #endif + for(ip = *in; i < n; i++) { + TEMPLATE2(_vbget, USIZE)(ip, x, ;); + if((start += x+VDELTA) == *key) + break; + } + *in = ip; + return i; +} #undef uint_t #endif diff --git a/vint.h b/vint.h index 98ad0b8..44d7166 100644 --- a/vint.h +++ b/vint.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2016 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// vint.h - "Integer Compression" variable byte include header +// "Integer Compression" variable byte include header #ifndef VINT_H #define VINT_H #include @@ -30,32 +30,37 @@ #ifdef __cplusplus extern "C" { #endif -//----------------------------------- Variable byte single value macros (low level) ----------------------------------------------- +//----------------------------------- Variable byte: single value macros (low level) ----------------------------------------------- //------------- 32 bits ------------- -#define _vbput32(_op_, _x_, _act_) {\ - if(likely(_x_ < (1<< 7))) { *_op_++ = _x_; _act_;}\ - else if(likely(_x_ < (1<<14))) { ctou16(_op_) = bswap16(_x_| 0x8000); _op_ += 2; _act_;}\ - else if(likely(_x_ < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0; ctou16(_op_) = _x_; _op_ += 2; _act_;}\ - else if(likely(_x_ < (1<<28))) { ctou32(_op_) = bswap32(_x_| 0xe0000000); _op_ += 4; _act_;}\ - else { *_op_++ = (unsigned long long)_x_ >> 32 | 0xf0; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ +extern unsigned char _vtab32_[]; +#define _vbxvlen32(_x_) _vtab32_[(unsigned char)(_x_)>>4] // (clz32((_x_) ^ 0xff) - 23) // +#define _vbxlen32(_x_) ((bsr32(_x_|1)+6)/7) + +#define _vbxput32(_op_, _x_, _act_) {\ + if(likely((_x_) < (1<< 7))) { *_op_++ = _x_; _act_;}\ + else if(likely((_x_) < (1<<14))) { ctou16(_op_) = bswap16((_x_) | 0x8000u); _op_ += 2; _act_;}\ + else if(likely((_x_) < (1<<21))) { 
*_op_++ = _x_ >> 16 | 0xc0u; ctou16(_op_) = _x_; _op_ += 2; _act_;}\ + else if(likely((_x_) < (1<<28))) { ctou32(_op_) = bswap32((_x_) | 0xe0000000u); _op_ += 4; _act_;}\ + else { *_op_++ = (unsigned long long)(_x_) >> 32 | 0xf0u; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ } -#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ - if(!(_x_ & 0x80)) { _act_;}\ - else if(!(_x_ & 0x40)) { _x_ = bswap16(ctou16(_ip_++-1) & 0xff3f); _act_;}\ - else if(!(_x_ & 0x20)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\ - else if(!(_x_ & 0x10)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0f) ; _ip_ += 3; _act_;}\ - else { _x_ = (unsigned long long)(_x_ & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\ +#define _vbxget32(_ip_, _x_, _act_) do { _x_ = (unsigned)(*_ip_++);\ + if(!(_x_ & 0x80u)) { _act_;}\ + else if(!(_x_ & 0x40u)) { _x_ = bswap16(ctou16(_ip_ - 1) & 0xff3fu); _ip_++; _act_;}\ + else if(!(_x_ & 0x20u)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\ + else if(!(_x_ & 0x10u)) { _x_ = bswap32(ctou32(_ip_-1) & 0xffffff0fu); _ip_ += 3; _act_;}\ + else { _x_ = (unsigned long long)((_x_) & 0x07)<<32 | ctou32(_ip_); _ip_ += 4; _act_;}\ } while(0) -#define _vblen32(_x_) ((_x_) >= (1<<7)?((_x_) >= (1<<14)?((_x_) >= (1<<21)?((_x_) >= (1<<28)?5:4):3):2):1) -#define _vbvlen32(_x_) _vtab32_[((unsigned char)(_x_))>>4] //------------- 64 bits ----------- -#define _vbput64(_op_, _x_, _act_) {\ - if(likely(_x_ < (1<< 7))) { *_op_++ = _x_; _act_;}\ - else if(likely(_x_ < (1<<14))) { ctou16(_op_) = bswap16(_x_| 0x8000); _op_ += 2; _act_;}\ - else if(likely(_x_ < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0; ctou16(_op_) = _x_; _op_ += 2; _act_;}\ - else if(likely(_x_ < (1<<28))) { ctou32(_op_) = bswap32(_x_| 0xe0000000); _op_ += 4; _act_;}\ +#define _vbxlen64(_x_) ((bsr64(_x_)+6)/7) +#define _vbxvlen64(_x_) ((_x_)==0xff?9:clz32((_x_) ^ 0xff) - 23) + +#define _vbxput64(_op_, _x_, _act_) {\ + if(likely(_x_ < (1<< 7))) { *_op_++ = _x_; _act_;}\ + else if(likely(_x_ < 
(1<<14))) { ctou16(_op_) = bswap16(_x_| 0x8000); _op_ += 2; _act_;}\ + else if(likely(_x_ < (1<<21))) { *_op_++ = _x_ >> 16 | 0xc0; ctou16(_op_) = _x_; _op_ += 2; _act_;}\ + else if(likely(_x_ < (1<<28))) { ctou32(_op_) = bswap32(_x_| 0xe0000000); _op_ += 4; _act_;}\ else if( _x_ < 1ull<<35) { *_op_++ = _x_ >> 32 | 0xf0; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ else if( _x_ < 1ull<<42) { ctou16(_op_) = bswap16(_x_ >> 32 | 0xf800); _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ else if( _x_ < 1ull<<49) { *_op_++ = _x_ >> 48 | 0xfc; ctou16(_op_) = _x_ >> 32; _op_ += 2; ctou32(_op_) = _x_; _op_ += 4; _act_;}\ @@ -63,7 +68,7 @@ extern "C" { else { *_op_++ = 0xff; ctou64(_op_) = _x_; _op_ += 8; _act_;}\ } -#define _vbget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ +#define _vbxget64(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ if(!(_x_ & 0x80)) { _act_;}\ else if(!(_x_ & 0x40)) { _x_ = bswap16(ctou16(_ip_++-1) & 0xff3f); _act_;}\ else if(!(_x_ & 0x20)) { _x_ = (_x_ & 0x1f)<<16 | ctou16(_ip_); _ip_ += 2; _act_;}\ @@ -72,119 +77,178 @@ extern "C" { else if(!(_x_ & 0x04)) { _x_ = (unsigned long long)(bswap16(ctou16(_ip_-1)) & 0x7ff) << 32 | ctou32(_ip_+1); _ip_ += 5; _act_;}\ else if(!(_x_ & 0x02)) { _x_ = (_x_ & 0x03)<<48 | (unsigned long long)ctou16(_ip_) << 32 | ctou32(_ip_+2); _ip_ += 6; _act_;}\ else if(!(_x_ & 0x01)) { _x_ = bswap64(ctou64(_ip_-1)) & 0x01ffffffffffffffull; _ip_ += 7; _act_;}\ - else { _x_ = ctou64(_ip_); _ip_ += 8; _act_;}\ + else { _x_ = ctou64(_ip_); _ip_ += 8; _act_;}\ } while(0) +#define vbxput64(_op_, _x_) { unsigned long long _x = _x_; _vbxput64(_op_, _x, ;); } +#define vbxput32(_op_, _x_) { register unsigned _x = _x_; _vbxput32(_op_, _x, ;); } +#define vbxput16(_op_, _x_) vbxput32(_op_, _x_) -#define _vbput640(_op_, _x_, _act_) {\ - if(_x_ < 1 << 7) { *_op_++ = _x_ << 1; _act_;}\ - else if(_x_ < 1 <<14) { ctou16(_op_) = _x_ << 2 | 0x01; _op_ += 2; _act_;}\ - else if(_x_ < 1 <<21) { ctou16(_op_) = _x_ << 3 | 0x03; _op_ += 2; *_op_++ = _x_ >> 13; 
_act_;}\ - else if(_x_ < 1 <<28) { ctou32(_op_) = _x_ << 4 | 0x07; _op_ += 4; _act_;}\ - else if(_x_ < 1ull<<35) { ctou32(_op_) = _x_ << 5 | 0x0f; _op_ += 4; *_op_++ = _x_ >> 27; _act_;}\ - else if(_x_ < 1ull<<42) { ctou32(_op_) = _x_ << 6 | 0x1f; _op_ += 4; ctou16(_op_) = _x_ >> 26; _op_+=2; _act_;}\ - else if(_x_ < 1ull<<49) { ctou32(_op_) = _x_ << 7 | 0x3f; _op_ += 4; ctou16(_op_) = _x_ >> 25; *(_op_+2) = _x_ >> 41; _op_+=3; _act_;}\ - else if(_x_ < 1ull<<56) { ctou64(_op_) = _x_ << 8 | 0x7f; _op_ += 8; _act_;}\ - else { *_op_++ = 0xff; ctou64(_op_) = _x_; _op_+=8; _act_;}\ +#define vbxget64(_ip_, _x_) _vbxget64(_ip_, _x_, ;) +#define vbxget32(_ip_, _x_) _vbxget32(_ip_, _x_, ;) +#define vbxget16(_ip_, _x_) vbxget32(_ip_,_x_) +//--------------------------------------------------------------------------- +#define VB_SIZE 64 +#define VB_MAX 254 +#define VB_B2 6 +#define VB_B3 3 +#define VB_BA3 (VB_MAX - (VB_SIZE/8 - 3)) +#define VB_BA2 (VB_BA3 - (1<> 8); *_op_++ = (_x_);*/ _act_; }\ + else if ((_x_) < VB_OFS3) { *_op_++ = VB_BA2 + (((_x_) -= VB_OFS2) >> 16); ctou16(_op_) = (_x_); _op_ += 2; _act_;}\ + else { unsigned _b = (bsr32((_x_))+7)/8; *_op_++ = VB_BA3 + (_b - 3); ctou32(_op_) = (_x_); _op_ += _b; _act_;}\ } -#define _vbget640(_ip_, _x_, _act_) do {\ - if(!((_x_ = *_ip_) & 1<<0)) { _ip_++; _x_ >>= 1; _act_;}\ - else if(!(_x_ & 1<<1)) { _x_ = ctou16(_ip_) >> 2; _ip_ += 2; _act_;}\ - else if(!(_x_ & 1<<2)) { _x_ = ctou16(_ip_) >> 3 | *(_ip_+2) << 13; _ip_ += 3; _act_;}\ - else if(!(_x_ & 1<<3)) { _x_ = ctou32(_ip_) >> 4; _ip_ += 4; _act_;}\ - else if(!(_x_ & 1<<4)) { _x_ = ctou32(_ip_) >> 5 | (unsigned long long)_ip_[4] << 27; _ip_ += 5; _act_;}\ - else if(!(_x_ & 1<<5)) { _x_ = ctou32(_ip_) >> 6 | (unsigned long long)ctou16(_ip_+4) << 26; _ip_ += 6; _act_;}\ - else if(!(_x_ & 1<<6)) { _x_ = ctou32(_ip_) >> 7 | (unsigned long long)ctou16(_ip_+4) << 25 | (unsigned long long)(_ip_[6]) << 41; _ip_ += 7; _act_;}\ - else if(!(_x_ & 1<<7)) { _x_ = ctou64(_ip_) >> 8; 
_ip_ += 8; _act_;}\ - else { _x_ = ctou64(_ip_+1); _ip_ += 9; _act_;}\ +#define _vbget32(_ip_, _x_, _act_) do { _x_ = *_ip_++;\ + if(likely(_x_ < VB_OFS1)) { _act_ ;}\ + else if(likely(_x_ < VB_BA2)) { _x_ = /*bswap16(ctou16(_ip_-1))*/ ((_x_<<8) + (*_ip_)) + (VB_OFS1 - (VB_OFS1 << 8)); _ip_++; _act_;} \ + else if(likely(_x_ < VB_BA3)) { _x_ = ctou16(_ip_) + ((_x_ - VB_BA2 ) << 16) + VB_OFS2; _ip_ += 2; _act_;}\ + else { unsigned _b = _x_-VB_BA3; _x_ = ctou32(_ip_) & ((1u << 8 * _b << 24) - 1); _ip_ += 3 + _b; _act_;}\ } while(0) +#define _vblen64(_x_) _vblen32(_x_) +#define _vbvlen64(_x_) _vbvlen32(_x_) +#define _vbput64(_op_, _x_, _act_) _vbput32(_op_, _x_, _act_) +#define _vbget64(_ip_, _x_, _act_) _vbget32(_ip_, _x_, _act_) + +#ifdef _WIN32 +//#define fgetc_unlocked(_f_) _fgetc_nolock(_f_) +#define fputc_unlocked(_c_, _f_) fputc(_c_,_f_) +#define fgetc_unlocked(_f_) fgetc(_f_) +#else +#define fputc_unlocked(_c_, _f_) _IO_putc_unlocked(_c_,_f_) +#define fgetc_unlocked(_f_) _IO_getc_unlocked(_f_) +#endif -#define _vblen64(_x_) ((_x_) >= (1<<7)?((_x_) >= (1<<14)?((_x_) >= (1<<21)?((_x_) >= (1<<28)?((_x_) >= (1ull<<35)?((_x_) >= (1ull<<42)?((_x_) >= (1ull<<49)?((_x_) >= (1ull<<56)?9:8):7):6):5):4):3):2):1) -#define _vbvlen64(_x_) _vtab64_[_x_] +#define leb128put(_op_, _x_) { typeof(_x_) _x = _x_; while(_x > 0x7f) { *_op_++ = _x & 0x7f; _x >>= 7; } *_op_++ = _x | 0x80; } +#define vbfput32(_f_, _x_) ({ typeof(_x_) _x = _x_; while(_x > 0x7f) { fputc_unlocked(_x & 0x7f, _f_); _x >>= 7; } fputc_unlocked(_x | 0x80, _f_); }) + +#define _leb128get(_ip_, _x_, _act_) { unsigned _sft=0; for(_x_=0;;_sft += 7) { unsigned _c = *_ip_++; _x_ += (_c & 0x7f) << _sft; if(_c >= 0x80) { _act_; break; } } } +#define leb128get(_ip_, _x_) vbgetax(_ip_, _x_, ;) +#define vbfget32(_f_ ) ({ unsigned _sft=0,_x=0; for(;;_sft += 7) { unsigned _c = fgetc_unlocked(_f_); if(_c != EOF) { _x += (_c & 0x7f) << _sft; if(_c & 0x80) break; } else { _x = EOF; break; } } _x; }) //------------- 16 bits 
----------- +#define _vblen16(_x_) _vblen32(_x_) +#define _vbvlen16(_x_) _vbvlen32(_x_) + #define _vbput16(_op_, _x_, _act_) _vbput32(_op_, _x_, _act_) #define _vbget16(_ip_, _x_, _act_) _vbget32(_ip_, _x_, _act_) -//------------- 15 bits ----------- -#define _vbput15(_op_, _x_, _act_) do { if(likely((_x_) < 0x80)) { *_op_++ = _x_; _act_; } else { *_op_++ = (_x_) >> 8 | 0x80; *_op_++ = _x_; } } while(0) -#define _vbget15(_ip_, _x_, _act_) do { if(!((_x_ = *_ip_++) & 0x80)) _x_ = (_x_ & 0x7f) << 8 | *_ip_++; _act_; } while(0) -#define _vblen15(_x_) ((_x_) >= 0x80?2:1) - -//----------------------------- Variable byte functions ----------------------------------------------------------------------- -// ---- Variable byte length after compressing value _x_ -static inline unsigned vblen64(uint64_t x) { return _vblen64(x); } +//----------------------------------- Variable byte: single value functions ----------------------------------------------- +// ---- Variable byte length after compression +static inline unsigned vblen16(unsigned short x) { return _vblen16(x); } static inline unsigned vblen32(unsigned x) { return _vblen32(x); } -static inline unsigned vblen16(unsigned short x) { return _vblen32(x); } -static inline unsigned vblen15(unsigned short x) { return _vblen15(x); } +static inline unsigned vblen64(uint64_t x) { return _vblen64(x); } -// ---- Length of compressed value. Input in is the compressed buffer start -static inline unsigned vbvlen64(unsigned char *in) { return in[0]==0xff?9:clz32(in[0] ^ 0xff) - 23; } -static inline unsigned vbvlen32(unsigned char *in) { return clz32((in[0] ^ 0xff) | 0x08) - 23; } -#define vbvlen16(p) vbvlen32(p) -static inline unsigned vbvlen15(unsigned char *in) { return (in[0] >> 7)+1; } +// ---- Length of compressed value. Input in is the first char of the compressed buffer start (Ex. 
vbvlen32(in[0]) ) +static inline unsigned vbvlen16(unsigned x) { return _vbvlen32(x); } +static inline unsigned vbvlen32(unsigned x) { return _vbvlen32(x); } +static inline unsigned vbvlen64(unsigned x) { return _vbvlen64(x); } //----- encode/decode 16/32/64 single value and advance output/input pointer #define vbput64(_op_, _x_) { unsigned long long _x = _x_; _vbput64(_op_, _x, ;); } #define vbput32(_op_, _x_) { register unsigned _x = _x_; _vbput32(_op_, _x, ;); } #define vbput16(_op_, _x_) vbput32(_op_, _x_) -#define vbput15(_op_, _x_) { unsigned _x = _x_; _vbput15(_op_, _x, ;); } -#define vbget64(_ip_) ({ unsigned long long _x; _vbget64(_ip_, _x, ;); _x; }) -#define vbget32(_ip_) ({ register unsigned _x; _vbget32(_ip_, _x, ;); _x; }) -#define vbget16(_ip_) vbget32(_ip_) -#define vbget15(_ip_) ({ unsigned _x; _vbget15(_ip_, _x, ;); _x; }) +#define vbget64(_ip_, _x_) _vbget64(_ip_, _x_, ;) +#define vbget32(_ip_, _x_) _vbget32(_ip_, _x_, ;) +#define vbget16(_ip_, _x_) vbget32(_ip_,_x_) -// ---- Encode array with n integer values to the buffer out. 
Return value = end of compressed output buffer out -unsigned char *vbenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *vbenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); +//----------------------------- Variable byte: array functions ----------------------------------------------------------------------- +// Encoding/DEcoding: Return value = end of compressed output/input buffer out/in + +//----------------------- Encoding/Decoding unsorted array with n integer values -------------------------- unsigned char *vbenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); -unsigned char *vbenc15( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); // 15 bits range (0-0x7fff) +unsigned char *vbenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); +unsigned char *vbenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); -//----- Decode Return value = end of compressed input buffer in -unsigned char *vbdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); -unsigned char *vbdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); +//-- Decode unsigned char *vbdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out); -unsigned char *vbdec15( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out); +unsigned char *vbdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); +unsigned char *vbdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); -//----- Delta encoding for increasing integer lists. 
Return value = end of compressed output buffer out -unsigned char *vbdenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -unsigned char *vbdenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +//-- Get value stored at index idx (idx:0...n-1) +unsigned short vbgetx16( unsigned char *__restrict in, unsigned idx); +unsigned vbgetx32( unsigned char *__restrict in, unsigned idx); +uint64_t vbgetx64( unsigned char *__restrict in, unsigned idx); + +//-- Search and return index of next value equal to key or n when no key value found +// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } +unsigned vbgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key); +unsigned vbgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key); +unsigned vbgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key); + +//---------------------- Delta encoding/decoding sorted array --------------------------------------------- +//-- Increasing integer array. 
out[i] = out[i-1] + in[i] unsigned char *vbdenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbdenc15( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); +unsigned char *vbdenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +unsigned char *vbdenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -//----- Delta decode Return value = end of compressed input buffer in -unsigned char *vbddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -unsigned char *vbddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); unsigned char *vbddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbddec15( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); +unsigned char *vbddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *vbddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -//----- Delta encoding for strictly increasing (never remaining constant or decreasing) integer lists. 
Return value = end of compressed output buffer out -unsigned char *vbd1enc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +//-- Get value stored at index idx (idx:0...n-1) +unsigned short vbdgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start); +unsigned vbdgetx32( unsigned char *__restrict in, unsigned idx, unsigned start); +uint64_t vbdgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start); + +//-- Search and return index of next value equal to key or n when no key value found +// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } +unsigned vbdgetgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start); +unsigned vbdgetgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start); +unsigned vbdgetgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start); + +//-- Strictly increasing (never remaining constant or decreasing) integer array. 
out[i] = out[i-1] + in[i] + 1 unsigned char *vbd1enc16(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbd1enc15(unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); +unsigned char *vbd1enc32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +unsigned char *vbd1enc64(uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -//----- Delta decode Return value = end of compressed input buffer in -unsigned char *vbd1dec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); unsigned char *vbd1dec16(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbd1dec15(unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); +unsigned char *vbd1dec32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *vbd1dec64(unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -//----- Zigzag encoding for unsorted integer lists. Return value = end of compressed output buffer out -unsigned char *vbzenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -unsigned char *vbzenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +//-- Get value stored at index idx (idx:0...n-1) +unsigned short vbd1getx16( unsigned char *__restrict in, unsigned idx, unsigned short start); +unsigned vbd1getx32( unsigned char *__restrict in, unsigned idx, unsigned start); +uint64_t vbd1getx64( unsigned char *__restrict in, unsigned idx, uint64_t start); + +//-- Search and return index of next value equal to key or n when no key value found +// ex. 
unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } +unsigned vbd1getgeq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short *key, unsigned short start); +unsigned vbd1getgeq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned *key, unsigned start); +unsigned vbd1getgeq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t *key, uint64_t start); + +//---------------------- Zigzag encoding/decoding for unsorted integer lists. unsigned char *vbzenc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); -unsigned char *vbzenc15( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned short start); +unsigned char *vbzenc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start); +unsigned char *vbzenc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, uint64_t start); -//----- Zigzag decode Return value = end of compressed input buffer in -unsigned char *vbzdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); -unsigned char *vbzdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); unsigned char *vbzdec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); -unsigned char *vbzdec15( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned short start); +unsigned char *vbzdec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *vbzdec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, uint64_t start); +//-- Get value stored at index idx (idx:0...n-1) +unsigned short vbzgetx16( unsigned char *__restrict in, unsigned idx, unsigned short start); +unsigned vbzgetx32( unsigned char *__restrict in, unsigned idx, 
unsigned start); +uint64_t vbzgetx64( unsigned char *__restrict in, unsigned idx, uint64_t start); + +//-- Search and return index of next value equal to key or n when no key value found +// ex. unsigned idx;unsigned char *ip; for(idx=0,ip=in;;) { if((idx = vgeteq32(&ip, idx, 4321))>=n) break; printf("found at %u ", idx); } +/*unsigned vbzgeteq15( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start); +unsigned vbzgeteq16( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned short key, unsigned start); +unsigned vbzgeteq32( unsigned char **__restrict in, unsigned n, unsigned idx, unsigned key, unsigned start); +unsigned vbzgeteq64( unsigned char **__restrict in, unsigned n, unsigned idx, uint64_t key, unsigned start);*/ #ifdef __cplusplus } #endif diff --git a/vp4c.c b/vp4c.c new file mode 100644 index 0000000..e0a770f --- /dev/null +++ b/vp4c.c @@ -0,0 +1,236 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// "Integer Compression" Turbo PFor/PforDelta + #ifndef USIZE +#include + +#include "conf.h" +#include "bitpack.h" +#include "vp4c.h" +#include "vint.h" +#include "vsimple.h" + +#define PAD8(_x_) ( (((_x_)+8-1)/8) ) +//------------------------------------------ +#define VSIZE 256 + +#define EXCEP 0 // Hybrid TurboPFor : 1=Variable byte 2=Vsimple +#define _P4BITS _p4bits +//-- Scalar +#define _P4ENC _p4enc +#define P4ENC p4enc +//#define P4NENC p4dnenc +#define BITPACK bitpack + +#define USIZE 16 +#include __FILE__ +#undef USIZE + +#define USIZE 32 +#include __FILE__ +#undef USIZE + +#define USIZE 64 +#include __FILE__ +#undef USIZE + +#define EXCEP 0 // Direct access +#define _P4BITS _p4bitsx +#define _P4ENC _p4encx +#define P4ENC p4encx + +#define USIZE 16 +#include __FILE__ +#undef USIZE + +#define USIZE 32 +#include __FILE__ +#undef USIZE + +#undef _P4ENC +#undef P4ENC +#undef BITPACK + +#undef _P4BITS + +#define EXCEP 1 // +//-- SIMD: Vertical bitpacking +#define P4SIMD +#define _P4ENC _p4enc128v +#define P4ENC p4enc128v +//#define P4NENC p4dnencv +#define BITPACK bitpack128v +#define USIZE 32 +#include __FILE__ +#undef _P4ENC +#undef P4ENC +#undef BITPACK + +#ifdef __AVX2__ +#define _P4ENC _p4enc256v +#define P4ENC p4enc256v +#define BITPACK bitpack256v +#define USIZE 32 +#include __FILE__ +#undef _P4ENC +#undef P4ENC +#undef BITPACK +#endif + +#undef USIZE + #else +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wparentheses" + +#define uint_t TEMPLATE3(uint, USIZE, _t) + +#define VSC(a) + #ifdef _P4BITS +unsigned TEMPLATE2(_P4BITS, USIZE)(uint_t *__restrict in, unsigned n, unsigned *pbx) { + uint_t *ip; int b=0,r; int i,ml,l; + unsigned x, bx, cnt[USIZE+1] = {0}, _vb[USIZE*2+5] = {0}, *vb=&_vb[USIZE],fx=0, b64=(n+7)/8; + #if EXCEP >= 2 + unsigned long 
long smap[USIZE+1][VSIZE/64]={0},xmap[VSIZE/64]={0}; unsigned c; + #endif + + #define CNTE { ++cnt[r=TEMPLATE2(bsr, USIZE)(*ip)]; VSC(c = ip-in; smap[r][c>>6] |= (1ull << (c&0x3f))); b |= *ip++; } + for(ip = in; ip != in+(n&~3); ) { CNTE; CNTE; CNTE; CNTE; } + while(ip != in+n) CNTE; + + b = TEMPLATE2(bsr, USIZE)(b); + bx = b; + ml = PAD8(n*b)+1; x = cnt[b]; + #if EXCEP == 1 +#define VBB(_x_,_b_) vb[_b_-7]+=_x_; vb[_b_-15]+=_x_*2; vb[_b_-19]+=_x_*3; vb[_b_-25]+=_x_*4; + int vv = x; VBB(x,b); + #elif EXCEP == 2 + for(c = 0; c < (n+63)/64;c++) xmap[c] = smap[b][c]; + #else + ml -= 2+b64; + #endif + for(i = b-1; i >= 0; --i) { + #if EXCEP == 1 + l = PAD8(n*i) + 2+b64 + PAD8(x*(bx-i)); + int v = PAD8(n*i) + 2 + x + vv, vx = 0; + x += cnt[i]; vv+=cnt[i]+vb[i]; VBB(cnt[i],i); + if(v < l) l=v,vx=1; if(unlikely(l < ml)) ml=l,b=i,fx=vx; + #elif EXCEP == 2 + uint_t tin[VSIZE],*tp=tin; + for(ip=in,c = 0; c < (n+63)/64;c++,ip+=64) { + unsigned long long z = xmap[c]; + while(z) { unsigned x = ctz64(z); *tp++ = ip[x]; z ^= (1ull< 0 + *pbx = fx?(USIZE+1):(bx - b); + #else + *pbx = bx - b; + #endif + return b; +} + #endif + +unsigned char *TEMPLATE2(_P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx) { + if(!bx) + return TEMPLATE2(BITPACK, USIZE)(in, + #ifndef P4SIMD + n, + #endif + out, b); + + uint_t msk = (1ull << b)-1,_in[VSIZE], inx[VSIZE*2]; + unsigned long long xmap[VSIZE/64] = { 0 }; + unsigned miss[VSIZE],i, xn, c; + #define MISS { miss[xn] = i; xn += in[i] > msk; _in[i] = in[i] & msk; i++; } + for(xn = i = 0; i != n&~3; ) { MISS; MISS; MISS; MISS; } + while(i != n) MISS; + + for(i = 0; i != xn; ++i) { + c = miss[i]; + xmap[c>>6] |= (1ull << (c&0x3f)); + inx[i] = in[c] >> b; + } + unsigned char *_out = out; + #if EXCEP > 0 + if(bx <= USIZE) { + #endif + for(i = 0; i < (n+63)/64; i++) ctou64(out+i*8) = xmap[i]; out += PAD8(n); + out = TEMPLATE2(bitpack, USIZE)(inx, xn, out, bx); + out = TEMPLATE2(BITPACK, USIZE)(_in, + 
#ifndef P4SIMD + n, + #endif + out, b); + #if EXCEP > 0 + } + else { + *out++ = xn; + out = TEMPLATE2(BITPACK, USIZE)(_in, + #ifndef P4SIMD + n, + #endif + out, b); + out = TEMPLATE2(vbenc, USIZE)(inx, xn, out); + for(i = 0; i != xn; ++i) *out++ = miss[i]; + } + #endif + return out; +} + +unsigned char *TEMPLATE2(P4ENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out) { + unsigned bx, b = TEMPLATE2(_p4bits, USIZE)(in, n, &bx); + #if EXCEP > 0 + if(bx <= USIZE) { P4SAVE(out, b, bx); } else *out++= 0x80|b<<1; + #else + P4SAVE(out, b, bx); + #endif + return TEMPLATE2(_P4ENC, USIZE)(in, n, out, b, bx); +} + +/*unsigned char *TEMPLATE2(P4NENC, USIZE)(uint_t *__restrict in, unsigned n, unsigned char *__restrict out) { + uint_t *ip; + for(ip = in; ip < in+n; ip += VSIZE) { unsigned l = (in+n) - ip; l = min(l, VSIZE); + out = TEMPLATE2(P4ENC, USIZE)(ip, l, out);; + } +}*/ +#pragma clang diagnostic pop + #endif +/* +111 : 32 bits = bitpack b=32 if bits=0x1f EOB +000 : bitpack = bitpack no exp b=0..5 +001 : vbyte b=0..5 +010 : exp equ. b=0..5 b,bitpack +011 : bp. equal b=0..5 b,exp. +1XX : EOB +*/ + diff --git a/vp4c.h b/vp4c.h new file mode 100644 index 0000000..6a6fe55 --- /dev/null +++ b/vp4c.h @@ -0,0 +1,56 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// vp4dc.h - "Integer Compression" TurboPfor (see vp4dd.h for decompression) + +#define P4EB(_b_) (_b_ << 1) +#define P4EBX(_b_, _bx_) (_bx_ << 8 | _b_ << 1 | 1) +#define P4SAVE(_out_, _b_, _bx_) do { if(!_bx_) *_out_++ = P4EB(_b_);else *(unsigned short *)_out_ = P4EBX(_b_, _bx_), _out_ += 2; } while(0) + +#ifdef __cplusplus +extern "C" { +#endif + +// compress integer array with n values to the buffer out. Return value = end of compressed buffer out +unsigned char *p4enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out); +unsigned char *p4enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); +unsigned char *p4enc128v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) +unsigned char *p4enc256v32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); // SIMD (Vertical bitpacking) +unsigned char *p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out); + +unsigned char *p4encx16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out);// Direct access +unsigned char *p4encx32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out); + +// same as p4denc, but with b and bx as parameters. 
Call after _p4d16,_p4d32,_p4d64 +unsigned char *_p4enc16( unsigned short *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); +unsigned char *_p4enc32( unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); +unsigned char *_p4enc128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking) +unsigned char *_p4enc256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical bitpacking) +unsigned char *_p4enc64( uint64_t *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b, unsigned bx); +// calculate the best bit sizes b and bx, return b. +unsigned _p4bits16( unsigned short *__restrict in, unsigned n, unsigned *pbx); +unsigned _p4bits32( unsigned *__restrict in, unsigned n, unsigned *pbx); +unsigned _p4bits64( uint64_t *__restrict in, unsigned n, unsigned *pbx); +#ifdef __cplusplus +} +#endif diff --git a/vp4d.c b/vp4d.c new file mode 100644 index 0000000..08c13ef --- /dev/null +++ b/vp4d.c @@ -0,0 +1,316 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// "Integer Compression" TurboPFor - Pfor/PforDelta +#ifndef USIZE +#include + +#include "conf.h" +#include "bitunpack.h" +#include "bitutil.h" +#include "vp4d.h" +#include "vint.h" +#include "vsimple.h" +#define PAD8(__x) ( (((__x)+8-1)/8) ) +#define VSIZEX 256 + + #if 0 //defined(__AVX_2__) +#include "avx2.h" + #elif defined(__SSSE3__) +#include +static ALIGNED(char, shuffles[16][16], 16) = { + #define _ 0x80 + { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, + { 0,1,2,3, _,_,_,_, _,_, _, _, _, _, _,_ }, + { _,_,_,_, 0,1,2,3, _,_, _, _, _, _, _,_ }, + { 0,1,2,3, 4,5,6,7, _,_, _, _, _, _, _,_ }, + { _,_,_,_, _,_,_,_, 0,1, 2, 3, _, _, _,_ }, + { 0,1,2,3, _,_,_,_, 4,5, 6, 7, _, _, _,_ }, + { _,_,_,_, 0,1,2,3, 4,5, 6, 7, _, _, _,_ }, + { 0,1,2,3, 4,5,6,7, 8,9,10,11, _, _, _,_ }, + { _,_,_,_, _,_,_,_, _,_,_,_, 0, 1, 2, 3 }, + { 0,1,2,3, _,_,_,_, _,_,_, _, 4, 5, 6, 7 }, + { _,_,_,_, 0,1,2,3, _,_,_, _, 4, 5, 6, 7 }, + { 0,1,2,3, 4,5,6,7, _,_, _, _, 8, 9,10,11 }, + { _,_,_,_, _,_,_,_, 0,1, 2, 3, 4, 5, 6, 7 }, + { 0,1,2,3, _,_,_,_, 4,5, 6, 7, 8, 9,10,11 }, + { _,_,_,_, 0,1,2,3, 4,5, 6, 7, 8, 9,10,11 }, + { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, + #undef _ +}; + #endif + + +#define USIZE 64 + +#define _P4DEC _p4dec +#define P4DEC p4dec +#define BITUNPACK bitunpack // unpack only +#define BITUNPACKD bitunpack // integrated unpack +#define _BITUNPACKD bitunpack // integrated +#define P4START +#include __FILE__ + +#define P4DECX + +#define USIZE 16 +#include __FILE__ + +#define USIZE 32 +#include __FILE__ +#undef P4DECX +#undef P4DECX + +#define P4START start, +#define P4START_T +#define _P4DEC _p4ddec //delta0 +#define P4DEC p4ddec +#define BITUNPACKD bitdunpack +#define _BITUNPACKD _bitdunpack +#define BITUNDD bitund +#include __FILE__ + +#define _P4DEC _p4d1dec //delta1 +#define 
P4DEC p4d1dec +#define BITUNPACKD bitd1unpack +#define _BITUNPACKD bitd1unpack +#define BITUNDD bitund1 +#include __FILE__ + +#undef _P4DEC +#undef P4DEC +#undef BITUNPACK +#undef BITUNDD + +// SIMD ------------- +#define P4START +#undef P4START_T +#define VSIZE 128 +#define _P4DEC _p4dec128v +#define P4DEC p4dec128v +#define BITUNPACK bitunpack128v +#define BITUNPACKD bitunpack128v +#define _BITUNPACKD _bitunpack128v +#include __FILE__ + +#define P4START start, +#define P4START_T +#define _P4DEC _p4ddec128v +#define P4DEC p4ddec128v +#define BITUNPACKD bitdunpack128v +#define _BITUNPACKD _bitdunpack128v +#define BITUNDD bitund +#include __FILE__ + +#define _P4DEC _p4d1dec128v +#define P4DEC p4d1dec128v +#define BITUNPACKD bitd1unpack128v +#define _BITUNPACKD _bitd1unpack128v +#define BITUNDD bitund1 +#include __FILE__ +#undef BITUNDD + + #ifdef __AVX2__ +#define P4START +#undef P4START_T +#define VSIZE 256 +#define _P4DEC _p4dec256v +#define P4DEC p4dec256v +#define BITUNPACK bitunpack256v +#define BITUNPACKD bitunpack256v +#define _BITUNPACKD _bitunpack256v +#include __FILE__ + +#define P4START start, +#define P4START_T +#define _P4DEC _p4ddec256v +#define P4DEC p4ddec256v +#define BITUNPACKD bitdunpack256v +#define _BITUNPACKD _bitdunpack256v +#define BITUNDD bitund +#include __FILE__ + +#define _P4DEC _p4d1dec256v +#define P4DEC p4d1dec256v +#define BITUNPACKD bitd1unpack256v +#define _BITUNPACKD _bitd1unpack256v +#define BITUNDD bitund1 +#include __FILE__ +#undef BITUNDD + #endif + +#else +#define uint_t TEMPLATE3(uint, USIZE, _t) + +//#pragma GCC push_options +//#pragma GCC optimize ("unroll-loops") + #ifdef P4START_T +unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, uint_t start, unsigned b, unsigned bx ) { + #else +unsigned char *TEMPLATE2(_P4DEC, USIZE)(unsigned char *__restrict in, unsigned n, uint_t *__restrict out, unsigned b, unsigned bx ) { + #endif + uint_t ex[VSIZEX+8]; + if(!(b & 1)) return 
TEMPLATE2(BITUNPACKD, USIZE)(in, + #ifndef VSIZE + n, + #endif + out, P4START b>>1); + + b >>= 1; + #ifdef VSIZE + unsigned char *pb = in; + #if VSIZE == 128 + in = TEMPLATE2(bitunpack, USIZE)(in+16, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)), ex, bx); + #else + in = TEMPLATE2(bitunpack, USIZE)(in+32, popcnt64(ctou64(in)) + popcnt64(ctou64(in+8)) + popcnt64(ctou64(in+16)) + popcnt64(ctou64(in+24)), ex, bx); + #endif + return TEMPLATE2(_BITUNPACKD, USIZE)(in, out, P4START b, ex, pb); + #else + unsigned long long bb[VSIZEX/64]; unsigned num=0,i,p4dn = (n+63)/64; + for(i = 0; i < n/64; i++) { bb[i] = ctou64(in+i*8); num += popcnt64(bb[i]); } + if(n & 0x3f) { bb[i] = ctou64(in+i*8) & ((1ull<<(n&0x3f))-1); num += popcnt64(bb[i]); } + in = TEMPLATE2(bitunpack, USIZE)(in+PAD8(n), num, ex, bx); + in = TEMPLATE2(bitunpack, USIZE)(in, + #ifndef VSIZE + n, + #endif + out, b); + #if 0 //defined(AVX_2__) + uint_t *op,*pex = ex; + for(i = 0; i < p4dn; i++) { + for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), mm256_maskz_expand_epi32(m,_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b)))); pex += mc; *pex=s; + } //out += 64; + } + #elif defined(__SSSE3__) && USIZE == 32 + uint_t *_op=out,*op,*pex = ex; + for(i = 0; i < p4dn; i++) { + for(op=_op; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; + _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); + } _op+=64; + } + #else + unsigned k = 0; uint_t *_op=out,*op; + for(op=_op,i = 0; i < p4dn; i++) { + while(bb[i]) { unsigned x = ctz64(bb[i]); op[x] += ex[k++]<>1; + bx = *in++; + in = TEMPLATE2(BITUNPACK, USIZE)(in, + #ifndef VSIZE + n, + #endif + out, b); + in = TEMPLATE2(vbdec, USIZE)(in, bx, 
ex); + for(i = 0; i != (bx & ~3); i += 4) { + out[in[i ]] |= ex[i ] << b; + out[in[i+1]] |= ex[i+1] << b; + out[in[i+2]] |= ex[i+2] << b; + out[in[i+3]] |= ex[i+3] << b; + } + for(;i < bx; i++) + out[in[i]] |= ex[i] << b; + in += bx; + #ifdef BITUNDD + TEMPLATE2(BITUNDD, USIZE)(out, n, start); + #endif + return in; + } +} + + #ifdef P4DECX +unsigned char *TEMPLATE2(p4decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out) { + unsigned b,i; + struct p4 p4; + p4ini(&p4, &in, n, &b); + + if(unlikely(p4.i & 1)) { + for(i = 0; i != n&~3; i+=4) { + out[i ] = TEMPLATE2(p4getx, USIZE)(&p4, in, i , b); + out[i+1] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+1, b); + out[i+2] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+2, b); + out[i+3] = TEMPLATE2(p4getx, USIZE)(&p4, in, i+3, b); + } + for( ; i != n; i++) + out[i ] = TEMPLATE2(p4getx, USIZE)(&p4, in, i , b); + } else { + for(i = 0; i != n&~3; i+=4) { + out[i ] = TEMPLATE2(_bitgetx, USIZE)(in, i *b, b); + out[i+1] = TEMPLATE2(_bitgetx, USIZE)(in, (i+1)*b, b); + out[i+2] = TEMPLATE2(_bitgetx, USIZE)(in, (i+2)*b, b); + out[i+3] = TEMPLATE2(_bitgetx, USIZE)(in, (i+3)*b, b); + } + for( ; i != n; i++) + out[i ] = TEMPLATE2(_bitgetx, USIZE)(in, i *b, b); + } + return in + PAD8(n*b); +} + +unsigned char *TEMPLATE2(p4fdecx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, unsigned start) { + unsigned b,i; + struct p4 p4; + p4ini(&p4, &in, n, &b); + + if(unlikely(p4.i & 1)) { + for(i = 0; i < n; i++) + out[i] = TEMPLATE2(p4getx, USIZE)(&p4, in, i, b)+start+i+1; + } else for(i = 0; i < n; i++) out[i] = TEMPLATE2(_bitgetx, USIZE)(in, i*b, b)+start+i+1; + return in + PAD8(n*b); +} + +unsigned char *TEMPLATE2(p4f0decx, USIZE)(unsigned char *in, unsigned n, uint_t *__restrict out, unsigned start) { + unsigned b,i; + struct p4 p4; + p4ini(&p4, &in, n, &b); + + if(unlikely(p4.i & 1)) { + for(i = 0; i < n; i++) + out[i] = TEMPLATE2(p4getx, USIZE)(&p4, in, i, b)+start;// return p4.ex + PAD8((p4.cum[P4DN-1] + 
popcnt64(p4.xmap[P4DN-1]))*p4.bx); + } else for(i = 0; i < n; i++) out[i] = TEMPLATE2(_bitgetx, USIZE)(in, i*b, b)+start; + return in + PAD8(n*b); +} + #endif + +#endif diff --git a/vp4d.h b/vp4d.h new file mode 100644 index 0000000..83c8f49 --- /dev/null +++ b/vp4d.h @@ -0,0 +1,133 @@ +/** + Copyright (C) powturbo 2013-2017 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - homepage : https://sites.google.com/site/powturbo/ + - github : https://github.com/powturbo + - twitter : https://twitter.com/powturbo + - email : powturbo [_AT_] gmail [_DOT_] com +**/ +// "Integer Compression" Turbo PforDelta + +#ifdef __cplusplus +extern "C" { +#endif +#include +//#include + +//---------------- Bulk decompress of TurboPFor compressed integer array ------------------------------------------------------- +// decompress a previously (with p4enc32) 32/64 bits packed array. Return value = end of packed buffer in +//-- scalar. 
(see p4getx32 for direct access) +unsigned char *p4dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out); +unsigned char *p4dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); +unsigned char *p4dec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); // SIMD (Vertical BitPacking) +unsigned char *p4dec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out); +unsigned char *p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out); + +// b and bx specified (not stored within the compressed stream header) +unsigned char *_p4dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned b, unsigned bx); +unsigned char *_p4dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned bx); +unsigned char *_p4dec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned bx); // SIMD (Vertical BitPacking) +unsigned char *_p4dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned b, unsigned bx); + +//------ Delta decoding --------------------------- Return value = end of packed input buffer in --------------------------- +//-- Increasing integer lists. 
out[i] = out[i-1] + in[i] +unsigned char *p4ddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start); +unsigned char *p4ddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *p4ddec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); // SIMD (Vertical BitPacking) +unsigned char *p4ddec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned start); +// b and bx specified +unsigned char *_p4ddec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b, unsigned bx); +unsigned char *_p4ddec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); +unsigned char *_p4ddec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); +unsigned char *_p4ddec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); +unsigned char *_p4ddec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned start, unsigned b, unsigned bx); + +//-- Strictly increasing (never remaining constant or decreasing) integer lists. 
out[i] = out[i-1] + in[i] + 1 +unsigned char *p4d1dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start); +unsigned char *p4d1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *p4d1dec128v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); // SIMD (Vertical BitPacking) +unsigned char *p4d1dec256v32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start); +unsigned char *p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned start); +// b and bx specified (see idxcr.c/idxqry.c for an example) +unsigned char *_p4d1dec16( unsigned char *__restrict in, unsigned n, unsigned short *__restrict out, unsigned start, unsigned b, unsigned bx); +unsigned char *_p4d1dec32( unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); +unsigned char *_p4d1dec128v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); // SIMD (Vertical BitPacking) +unsigned char *_p4d1dec256v32(unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned bx); // SIMD (Vertical BitPacking) +unsigned char *_p4d1dec64( unsigned char *__restrict in, unsigned n, uint64_t *__restrict out, unsigned start, unsigned b, unsigned bx); + +//---------------- Direct Access functions to compressed TurboPFor array p4encx16/p4encx32 ------------------------------------------------------- +#define P4D_PAD8(_x_) ( (((_x_)+8-1)/8) ) +#define P4D_B(_x_) (((_x_) >> 1) & 0x3f) +#define P4D_XB(_x_) (((_x_) & 1)?((_x_) >> 8):0) +#define P4D_ININC(_in_, _x_) _in_ += 1+(_x_ & 1) + +#define P4D_MAX 256 // vbencx maximum integers +static inline unsigned p4bits(unsigned char *__restrict in, int *bx) { unsigned i = ctou16(in); *bx = P4D_XB(i); return P4D_B(i); } + +struct p4 { + 
unsigned long long *xmap; + unsigned char *ex; + unsigned i,bx,cum[P4D_MAX/64+1]; + int oval,idx; +}; + +static unsigned long long p4xmap[P4D_MAX/64+1] = { 0 }; + +// prepare direct access usage +static inline void p4ini(struct p4 *p4, unsigned char **pin, unsigned n, unsigned *b) { unsigned char *in = *pin; + unsigned p4i = ctou16(in); + p4->i = p4i; + *b = P4D_B(p4i); + p4->bx = P4D_XB(p4i); //assert(n <= P4D_MAX); + *pin = p4->ex = ++in; + if(p4i&1) { ++in; + p4->xmap = (unsigned long long *)in; + unsigned num=0,j; + for(j=0; j < n/64; j++) { p4->cum[j] = num; num += popcnt64(ctou64(in+j*8)); } //p4->cum[j] = num; + if(n & 0x3f) num += popcnt64(ctou64(in+j*8) & ((1ull<<(n&0x3f))-1) ); + unsigned char *p; + p4->ex = p = in + (n+7)/8; + *pin = p = p4->ex+((num*p4->bx+7)/8); + } else p4->xmap = p4xmap; + p4->oval = p4->idx = -1; +} + +//---------- Get a single value with index "idx" from a "p4enc32" packed array +static ALWAYS_INLINE unsigned short p4getx16(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = _bitgetx16(in, idx*b, b); + if(/*(*p4->i&1) &&*/ unlikely(p4->xmap[bi = idx>>6] & (1ull<<(cl = (idx & 0x3f))))) u |= _bitgetx16(p4->ex, (p4->cum[bi] + popcnt64(p4->xmap[bi] & ~((~0ull)<bx, p4->bx ) << b; + return u; +} +static ALWAYS_INLINE unsigned p4getx32(struct p4 *p4, unsigned char *in, unsigned idx, unsigned b) { unsigned bi, cl, u = _bitgetx32(in, idx*b, b),bx=p4->bx; + if(/*(p4->i&1) &&*/ unlikely(p4->xmap[bi = idx>>6] & (1ull<<(cl = (idx & 0x3f))))) u |= _bitgetx32(p4->ex, (p4->cum[bi] + popcnt64(p4->xmap[bi] & ~((~0ull)<bx ) << b; + return u; +} + +// Get the next single value greater of equal to val +static ALWAYS_INLINE unsigned short p4geqx16(struct p4 *p4, unsigned char *in, unsigned b, unsigned short val) { do p4->oval += p4getx16(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; } +static ALWAYS_INLINE unsigned p4geqx32(struct p4 *p4, unsigned char *in, unsigned b, unsigned val) { do p4->oval += 
p4getx32(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; } +//static ALWAYS_INLINE uint64_t p4geq64(struct p4 *p4, unsigned char *__restrict in, unsigned b, uint64_t val) { do p4->oval += p4getx64(p4, in, ++p4->idx, b)+1; while(p4->oval < val); return p4->oval; } + +/* like p4dec32 but using direct access. This is only a demo showing direct access usage. Use p4dec32 instead for decompressing entire blocks */ +unsigned char *p4decx32( unsigned char *in, unsigned n, unsigned *out); // unsorted +unsigned char *p4f0decx32(unsigned char *in, unsigned n, unsigned *out, unsigned start); // FOR increasing +unsigned char *p4fdecx32( unsigned char *in, unsigned n, unsigned *out, unsigned start); // FOR strictly increasing + +#ifdef __cplusplus +} +#endif diff --git a/vp4dc.c b/vp4dc.c index 40e5e39..895d7f7 100644 --- a/vp4dc.c +++ b/vp4dc.c @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// vp4dd.c - "Integer Compression" Turbo PforDelta +// "Integer Compression" Turbo PforDelta #ifndef USIZE #include diff --git a/vsimple.c b/vsimple.c index 7d00764..d5dc94b 100644 --- a/vsimple.c +++ b/vsimple.c @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -21,7 +21,7 @@ - twitter : https://twitter.com/powturbo - email : powturbo [_AT_] gmail [_DOT_] com **/ -// vsimple.c - "Integer Compression" variable simple +// "Integer Compression" variable simple #ifndef USIZE #ifdef __SSE2__ #include @@ -98,7 +98,7 @@ unsigned char *TEMPLATE2(VSENC, USIZE)(uint_t *__restrict in, int n, unsigned ch for(m = x = TEMPLATE2(bsr, USIZE)(*ip);(r+1)*(xm = x > m?x:m) <= TEMPLATE2(s_lim, USIZE)[xm] && ip+r= 0xf) { @@ -106,7 +106,7 @@ unsigned char *TEMPLATE2(VSENC, USIZE)(uint_t *__restrict in, int n, unsigned ch if(n <= 0x100) *op++ = r; else - vbput32(op, r); + vbxput32(op, r); } else *op++ = r<<4; 
break; case 1: @@ -291,9 +291,9 @@ unsigned char *TEMPLATE2(VSENC, USIZE)(uint_t *__restrict in, int n, unsigned ch if(n <= 0x100) *op++ = r; else - vbput32(op, r); + vbxput32(op, r); } else *op++ = r<<4|8; - TEMPLATE2(vbput, USIZE)(op, ip[0]); + TEMPLATE2(vbxput, USIZE)(op, ip[0]); break; #endif @@ -317,8 +317,7 @@ unsigned char *TEMPLATE2(VSDEC, USIZE)(unsigned char *__restrict ip, int n, uint if(unlikely(r == 0xf)) { if(n <= 0x100) r = (w>>8)&0xff, ip++; - else - r = vbget32(ip); + else { vbxget32(ip, r); } } uint_t *q = op; op += r+1; #if defined(__SSE2__) @@ -440,10 +439,9 @@ unsigned char *TEMPLATE2(VSDEC, USIZE)(unsigned char *__restrict ip, int n, uint if(unlikely(r == 0xf)) { if(n <= 0x100) r = (w>>8)&0xff, ip++; - else - r = vbget32(ip); + else { vbxget32(ip, r); } } - uint_t u = TEMPLATE2(vbget, USIZE)(ip), *q=op; op += r+1; + uint_t *q=op,u; op += r+1; TEMPLATE2(vbxget, USIZE)(ip,u); #if defined(__SSE2__) && USIZE == 32 __m128i v = _mm_set1_epi32(u); while(q < op) { diff --git a/vsimple.h b/vsimple.h index 8109d7c..805c6d8 100644 --- a/vsimple.h +++ b/vsimple.h @@ -1,5 +1,5 @@ /** - Copyright (C) powturbo 2013-2015 + Copyright (C) powturbo 2013-2017 GPL v2 License This program is free software; you can redistribute it and/or modify @@ -30,15 +30,15 @@ extern "C" { #endif -// vsencNN: compress array with n unsigned (NN bits in[n]) values to the buffer out. Return value = end of compressed buffer out -unsigned char *vsenc16(unsigned short *__restrict in, int n, unsigned char *__restrict out); -unsigned char *vsenc32(unsigned *__restrict in, int n, unsigned char *__restrict out); -unsigned char *vsenc64(uint64_t *__restrict in, int n, unsigned char *__restrict out); +// vsencNN: compress array with n unsigned (NN bits in[n]) values to the buffer out. 
Return value = end of compressed output buffer out +unsigned char *vsenc16(unsigned short *__restrict in, int n, unsigned char *__restrict out); +unsigned char *vsenc32(unsigned *__restrict in, int n, unsigned char *__restrict out); +unsigned char *vsenc64(uint64_t *__restrict in, int n, unsigned char *__restrict out); -// vsdecNN: decompress buffer into an array of n unsigned values. Return value = end of decompressed buffer in -unsigned char *vsdec16(unsigned char *__restrict in, int n, unsigned short *__restrict out); -unsigned char *vsdec32(unsigned char *__restrict in, int n, unsigned *__restrict out); -unsigned char *vsdec64(unsigned char *__restrict in, int n, uint64_t *__restrict out); +// vsdecNN: decompress buffer into an array of n unsigned values. Return value = end of compressed input buffer in +unsigned char *vsdec16(unsigned char *__restrict in, int n, unsigned short *__restrict out); +unsigned char *vsdec32(unsigned char *__restrict in, int n, unsigned *__restrict out); +unsigned char *vsdec64(unsigned char *__restrict in, int n, uint64_t *__restrict out); #ifdef __cplusplus }